diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 71d0488b32790..7c759a1adc950 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -22,3 +22,12 @@ b9c1b51e45b845debb76d8658edabca70ca56079 # r302496: That is the revert of r302421 ff63090b0e1072bd398b8efef8ae2291613a6ec9 + +# Fix more line endings changed in r320089. NFC. +d8f0e6caa91e230a486c948ab643174e40bdf215 + +# Correct line endings that got mixed up in r320089; NFC. +29dc5ded45447915d96ef7ca3f02acf2232282e0 + +# Remove line-endings added by r320089. NFC. +100a0eedc00b2bf48bcdc6c209c000745a4a0e48 diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index d6913dfd3c078..90b39347bc9ac 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -329,12 +329,11 @@ static void setStaticAnalyzerCheckerOpts(const ClangTidyOptions &Opts, AnalyzerOptionsRef AnalyzerOptions) { StringRef AnalyzerPrefix(AnalyzerCheckNamePrefix); for (const auto &Opt : Opts.CheckOptions) { - StringRef OptName(Opt.first); - if (!OptName.startswith(AnalyzerPrefix)) + StringRef OptName(Opt.getKey()); + if (!OptName.consume_front(AnalyzerPrefix)) continue; // Analyzer options are always local options so we can ignore priority. - AnalyzerOptions->Config[OptName.substr(AnalyzerPrefix.size())] = - Opt.second.Value; + AnalyzerOptions->Config[OptName] = Opt.getValue().Value; } } @@ -450,8 +449,8 @@ ClangTidyASTConsumerFactory::CreateASTConsumer( std::vector ClangTidyASTConsumerFactory::getCheckNames() { std::vector CheckNames; for (const auto &CheckFactory : *CheckFactories) { - if (Context.isCheckEnabled(CheckFactory.first)) - CheckNames.push_back(CheckFactory.first); + if (Context.isCheckEnabled(CheckFactory.getKey())) + CheckNames.emplace_back(CheckFactory.getKey()); } #if CLANG_ENABLE_STATIC_ANALYZER diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp index c24b8553999cc..c99931e0aa3a2 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" +#include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" namespace clang { @@ -72,7 +73,7 @@ llvm::Expected ClangTidyCheck::OptionsView::get(StringRef LocalName) const { const auto &Iter = CheckOptions.find(NamePrefix + LocalName.str()); if (Iter != CheckOptions.end()) - return Iter->second.Value; + return Iter->getValue().Value; return llvm::make_error((NamePrefix + LocalName).str()); } @@ -85,7 +86,7 @@ findPriorityOption(const ClangTidyOptions::OptionMap &Options, StringRef NamePre return IterGlobal; if (IterGlobal == Options.end()) return IterLocal; - if (IterLocal->second.Priority >= IterGlobal->second.Priority) + if (IterLocal->getValue().Priority >= IterGlobal->getValue().Priority) return IterLocal; return IterGlobal; } @@ -94,7 +95,7 @@ llvm::Expected ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName) const { auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName); if (Iter != CheckOptions.end()) - return Iter->second.Value; + return Iter->getValue().Value; return llvm::make_error((NamePrefix + LocalName).str()); } @@ -126,7 +127,7 @@ bool ClangTidyCheck::OptionsView::get(StringRef LocalName, llvm::Expected ValueOr = get(LocalName); if (ValueOr) return *ValueOr; - logErrToStdErr(ValueOr.takeError()); + 
logIfOptionParsingError(ValueOr.takeError()); return Default; } @@ -135,7 +136,7 @@ llvm::Expected ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName) const { auto Iter = findPriorityOption(CheckOptions, NamePrefix, LocalName); if (Iter != CheckOptions.end()) - return getAsBool(Iter->second.Value, Iter->first); + return getAsBool(Iter->getValue().Value, Iter->getKey()); return llvm::make_error((NamePrefix + LocalName).str()); } @@ -145,7 +146,7 @@ bool ClangTidyCheck::OptionsView::getLocalOrGlobal(StringRef LocalName, llvm::Expected ValueOr = getLocalOrGlobal(LocalName); if (ValueOr) return *ValueOr; - logErrToStdErr(ValueOr.takeError()); + logIfOptionParsingError(ValueOr.takeError()); return Default; } @@ -168,17 +169,16 @@ void ClangTidyCheck::OptionsView::store( store(Options, LocalName, Value ? StringRef("true") : StringRef("false")); } -llvm::Expected -ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, - ArrayRef Mapping, - bool CheckGlobal, bool IgnoreCase) { +llvm::Expected ClangTidyCheck::OptionsView::getEnumInt( + StringRef LocalName, ArrayRef Mapping, bool CheckGlobal, + bool IgnoreCase) const { auto Iter = CheckGlobal ? findPriorityOption(CheckOptions, NamePrefix, LocalName) : CheckOptions.find((NamePrefix + LocalName).str()); if (Iter == CheckOptions.end()) return llvm::make_error((NamePrefix + LocalName).str()); - StringRef Value = Iter->second.Value; + StringRef Value = Iter->getValue().Value; StringRef Closest; unsigned EditDistance = -1; for (const auto &NameAndEnum : Mapping) { @@ -200,18 +200,38 @@ ClangTidyCheck::OptionsView::getEnumInt(StringRef LocalName, } if (EditDistance < 3) return llvm::make_error( - Iter->first, Iter->second.Value, std::string(Closest)); - return llvm::make_error(Iter->first, - Iter->second.Value); + Iter->getKey().str(), Iter->getValue().Value, Closest.str()); + return llvm::make_error(Iter->getKey().str(), + Iter->getValue().Value); } -void ClangTidyCheck::OptionsView::logErrToStdErr(llvm::Error &&Err) { - llvm::logAllUnhandledErrors( - llvm::handleErrors(std::move(Err), - [](const MissingOptionError &) -> llvm::Error { - return llvm::Error::success(); - }), - llvm::errs(), "warning: "); +void ClangTidyCheck::OptionsView::logIfOptionParsingError(llvm::Error &&Err) { + if (auto RemainingErrors = + llvm::handleErrors(std::move(Err), [](const MissingOptionError &) {})) + llvm::logAllUnhandledErrors(std::move(RemainingErrors), + llvm::WithColor::warning()); } + +template <> +Optional ClangTidyCheck::OptionsView::getOptional( + StringRef LocalName) const { + if (auto ValueOr = get(LocalName)) + return *ValueOr; + else + consumeError(ValueOr.takeError()); + return llvm::None; +} + +template <> +Optional +ClangTidyCheck::OptionsView::getOptionalLocalOrGlobal( + StringRef LocalName) const { + if (auto ValueOr = getLocalOrGlobal(LocalName)) + return *ValueOr; + else + consumeError(ValueOr.takeError()); + return llvm::None; +} + } // namespace tidy } // namespace clang diff --git a/clang-tools-extra/clang-tidy/ClangTidyCheck.h b/clang-tools-extra/clang-tidy/ClangTidyCheck.h index 54b7251267524..6237e216656be 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/ClangTidyCheck.h @@ -268,7 +268,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { if (llvm::Expected ValueOr = get(LocalName)) return *ValueOr; else - logErrToStdErr(ValueOr.takeError()); + logIfOptionParsingError(ValueOr.takeError()); return Default; } @@ -314,7 +314,7 @@ class ClangTidyCheck : public 
ast_matchers::MatchFinder::MatchCallback { if (llvm::Expected ValueOr = getLocalOrGlobal(LocalName)) return *ValueOr; else - logErrToStdErr(ValueOr.takeError()); + logIfOptionParsingError(ValueOr.takeError()); return Default; } @@ -330,7 +330,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, llvm::Expected> - get(StringRef LocalName, bool IgnoreCase = false) { + get(StringRef LocalName, bool IgnoreCase = false) const { if (llvm::Expected ValueOr = getEnumInt(LocalName, typeEraseMapping(), false, IgnoreCase)) return static_cast(*ValueOr); @@ -349,11 +349,11 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, T> - get(StringRef LocalName, T Default, bool IgnoreCase = false) { + get(StringRef LocalName, T Default, bool IgnoreCase = false) const { if (auto ValueOr = get(LocalName, IgnoreCase)) return *ValueOr; else - logErrToStdErr(ValueOr.takeError()); + logIfOptionParsingError(ValueOr.takeError()); return Default; } @@ -370,8 +370,7 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, llvm::Expected> - getLocalOrGlobal(StringRef LocalName, - bool IgnoreCase = false) { + getLocalOrGlobal(StringRef LocalName, bool IgnoreCase = false) const { if (llvm::Expected ValueOr = getEnumInt(LocalName, typeEraseMapping(), true, IgnoreCase)) return static_cast(*ValueOr); @@ -391,14 +390,40 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. template std::enable_if_t::value, T> - getLocalOrGlobal(StringRef LocalName, T Default, bool IgnoreCase = false) { + getLocalOrGlobal(StringRef LocalName, T Default, + bool IgnoreCase = false) const { if (auto ValueOr = getLocalOrGlobal(LocalName, IgnoreCase)) return *ValueOr; else - logErrToStdErr(ValueOr.takeError()); + logIfOptionParsingError(ValueOr.takeError()); return Default; } + /// Returns the value for the option \p LocalName represented as a ``T``. + /// If the option is missing returns None, if the option can't be parsed + /// as a ``T``, log that to stderr and return None. + template + llvm::Optional getOptional(StringRef LocalName) const { + if (auto ValueOr = get(LocalName)) + return *ValueOr; + else + logIfOptionParsingError(ValueOr.takeError()); + return llvm::None; + } + + /// Returns the value for the local or global option \p LocalName + /// represented as a ``T``. + /// If the option is missing returns None, if the + /// option can't be parsed as a ``T``, log that to stderr and return None. + template + llvm::Optional getOptionalLocalOrGlobal(StringRef LocalName) const { + if (auto ValueOr = getLocalOrGlobal(LocalName)) + return *ValueOr; + else + logIfOptionParsingError(ValueOr.takeError()); + return llvm::None; + } + /// Stores an option with the check-local name \p LocalName with /// string value \p Value to \p Options. void store(ClangTidyOptions::OptionMap &Options, StringRef LocalName, @@ -420,7 +445,8 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { /// supply the mapping required to convert between ``T`` and a string. 
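A minimal sketch of what such a mapping looks like for a hypothetical option enum (the enum and its spellings are invented for illustration; OptionEnumMapping is the real trait these accessors use):

  // Assumes ClangTidyCheck.h, which declares OptionEnumMapping, is included.
  // "MyCheckStyle" and its spellings are invented for this sketch.
  enum class MyCheckStyle { Strict, Lenient };

  namespace clang {
  namespace tidy {
  template <> struct OptionEnumMapping<MyCheckStyle> {
    // Maps enumerators to the spellings users write under CheckOptions.
    static llvm::ArrayRef<std::pair<MyCheckStyle, StringRef>> getEnumMapping() {
      static constexpr std::pair<MyCheckStyle, StringRef> Mapping[] = {
          {MyCheckStyle::Strict, "Strict"}, {MyCheckStyle::Lenient, "Lenient"}};
      return llvm::makeArrayRef(Mapping);
    }
  };
  } // namespace tidy
  } // namespace clang

With the mapping in place, Options.get("Style", MyCheckStyle::Strict) parses the option, and an unrecognized spelling is reported through UnparseableEnumOptionError, including the edit-distance suggestion computed above.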
template std::enable_if_t::value> - store(ClangTidyOptions::OptionMap &Options, StringRef LocalName, T Value) { + store(ClangTidyOptions::OptionMap &Options, StringRef LocalName, + T Value) const { ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); auto Iter = llvm::find_if( @@ -436,11 +462,11 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { llvm::Expected getEnumInt(StringRef LocalName, ArrayRef Mapping, - bool CheckGlobal, bool IgnoreCase); + bool CheckGlobal, bool IgnoreCase) const; template std::enable_if_t::value, std::vector> - typeEraseMapping() { + typeEraseMapping() const { ArrayRef> Mapping = OptionEnumMapping::getEnumMapping(); std::vector Result; @@ -455,7 +481,8 @@ class ClangTidyCheck : public ast_matchers::MatchFinder::MatchCallback { void storeInt(ClangTidyOptions::OptionMap &Options, StringRef LocalName, int64_t Value) const; - static void logErrToStdErr(llvm::Error &&Err); + /// Logs an Error to stderr if a \p Err is not a MissingOptionError. + static void logIfOptionParsingError(llvm::Error &&Err); std::string NamePrefix; const ClangTidyOptions::OptionMap &CheckOptions; @@ -523,6 +550,19 @@ void ClangTidyCheck::OptionsView::store( ClangTidyOptions::OptionMap &Options, StringRef LocalName, bool Value) const; +/// Returns the value for the option \p LocalName. +/// If the option is missing returns None. +template <> +Optional ClangTidyCheck::OptionsView::getOptional( + StringRef LocalName) const; + +/// Returns the value for the local or global option \p LocalName. +/// If the option is missing returns None. +template <> +Optional +ClangTidyCheck::OptionsView::getOptionalLocalOrGlobal( + StringRef LocalName) const; + } // namespace tidy } // namespace clang diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 521e6ef549b96..1471301a34317 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Regex.h" #include @@ -590,6 +591,7 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { // An event can be either the begin or the end of an interval. enum EventType { ET_Begin = 1, + ET_Insert = 0, ET_End = -1, }; @@ -621,10 +623,17 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { // one will be processed before, disallowing the second one, and the // end point of the first one will also be processed before, // disallowing the first one. - if (Type == ET_Begin) + switch (Type) { + case ET_Begin: Priority = std::make_tuple(Begin, Type, -End, -ErrorSize, ErrorId); - else + break; + case ET_Insert: + Priority = std::make_tuple(Begin, Type, -End, ErrorSize, ErrorId); + break; + case ET_End: Priority = std::make_tuple(End, Type, -Begin, ErrorSize, ErrorId); + break; + } } bool operator<(const Event &Other) const { @@ -662,19 +671,19 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { } // Build events from error intervals. 
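The Event switch above, together with the sort below, implements a sweep over event points: at one offset an ET_End (-1) sorts before an ET_Insert (0), which sorts before an ET_Begin (1), so touching intervals never conflict and an insertion is dropped only while another interval is genuinely open. The same idea as a self-contained sketch (simplified types, without the size and id tie-breakers of the real Event::Priority):

  #include <algorithm>
  #include <tuple>
  #include <vector>

  struct Fix { unsigned Begin, End, Id; }; // Begin == End is a pure insertion

  // Marks Apply[Id] = false for every fix that overlaps another fix.
  std::vector<bool> selectCompatible(const std::vector<Fix> &Fixes) {
    enum { ET_End = -1, ET_Insert = 0, ET_Begin = 1 };
    std::vector<std::tuple<unsigned, int, unsigned>> Events; // (pos, type, id)
    for (const Fix &F : Fixes) {
      if (F.Begin == F.End) {
        Events.emplace_back(F.Begin, ET_Insert, F.Id);
      } else {
        Events.emplace_back(F.Begin, ET_Begin, F.Id);
        Events.emplace_back(F.End, ET_End, F.Id);
      }
    }
    // Tuple comparison processes, at equal positions: ends first, then
    // inserts, then begins, so back-to-back intervals do not conflict.
    std::sort(Events.begin(), Events.end());
    std::vector<bool> Apply(Fixes.size(), true);
    int Open = 0;
    for (const auto &E : Events) {
      unsigned Id = std::get<2>(E);
      switch (std::get<1>(E)) {
      case ET_Begin:
        if (Open++ != 0)
          Apply[Id] = false;
        break;
      case ET_Insert:
        if (Open != 0)
          Apply[Id] = false;
        break;
      case ET_End:
        if (--Open != 0)
          Apply[Id] = false;
        break;
      }
    }
    return Apply;
  }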
- std::map> FileEvents; + llvm::StringMap> FileEvents; for (unsigned I = 0; I < ErrorFixes.size(); ++I) { for (const auto &FileAndReplace : *ErrorFixes[I].second) { for (const auto &Replace : FileAndReplace.second) { unsigned Begin = Replace.getOffset(); unsigned End = Begin + Replace.getLength(); - const std::string &FilePath = std::string(Replace.getFilePath()); - // FIXME: Handle empty intervals, such as those from insertions. - if (Begin == End) - continue; - auto &Events = FileEvents[FilePath]; - Events.emplace_back(Begin, End, Event::ET_Begin, I, Sizes[I]); - Events.emplace_back(Begin, End, Event::ET_End, I, Sizes[I]); + auto &Events = FileEvents[Replace.getFilePath()]; + if (Begin == End) { + Events.emplace_back(Begin, End, Event::ET_Insert, I, Sizes[I]); + } else { + Events.emplace_back(Begin, End, Event::ET_Begin, I, Sizes[I]); + Events.emplace_back(Begin, End, Event::ET_End, I, Sizes[I]); + } } } } @@ -686,14 +695,20 @@ void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { llvm::sort(Events); int OpenIntervals = 0; for (const auto &Event : Events) { - if (Event.Type == Event::ET_End) - --OpenIntervals; - // This has to be checked after removing the interval from the count if it - // is an end event, or before adding it if it is a begin event. - if (OpenIntervals != 0) - Apply[Event.ErrorId] = false; - if (Event.Type == Event::ET_Begin) - ++OpenIntervals; + switch (Event.Type) { + case Event::ET_Begin: + if (OpenIntervals++ != 0) + Apply[Event.ErrorId] = false; + break; + case Event::ET_Insert: + if (OpenIntervals != 0) + Apply[Event.ErrorId] = false; + break; + case Event::ET_End: + if (--OpenIntervals != 0) + Apply[Event.ErrorId] = false; + break; + } } assert(OpenIntervals == 0 && "Amount of begin/end points doesn't match"); } diff --git a/clang-tools-extra/clang-tidy/ClangTidyModule.cpp b/clang-tools-extra/clang-tidy/ClangTidyModule.cpp index e9031d498eeff..c72bcd3e188ce 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyModule.cpp @@ -18,15 +18,15 @@ namespace tidy { void ClangTidyCheckFactories::registerCheckFactory(StringRef Name, CheckFactory Factory) { - Factories[std::string(Name)] = std::move(Factory); + Factories.insert_or_assign(Name, std::move(Factory)); } std::vector> ClangTidyCheckFactories::createChecks(ClangTidyContext *Context) { std::vector> Checks; for (const auto &Factory : Factories) { - if (Context->isCheckEnabled(Factory.first)) - Checks.emplace_back(Factory.second(Factory.first, Context)); + if (Context->isCheckEnabled(Factory.getKey())) + Checks.emplace_back(Factory.getValue()(Factory.getKey(), Context)); } return Checks; } diff --git a/clang-tools-extra/clang-tidy/ClangTidyModule.h b/clang-tools-extra/clang-tidy/ClangTidyModule.h index 31cf4774a885a..7fd16c2a7b3c7 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyModule.h +++ b/clang-tools-extra/clang-tidy/ClangTidyModule.h @@ -69,7 +69,7 @@ class ClangTidyCheckFactories { std::vector> createChecks(ClangTidyContext *Context); - typedef std::map FactoryMap; + typedef llvm::StringMap FactoryMap; FactoryMap::const_iterator begin() const { return Factories.begin(); } FactoryMap::const_iterator end() const { return Factories.end(); } bool empty() const { return Factories.empty(); } diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index 4c1f286737817..6b28cb2bdd13d 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ 
b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -70,7 +70,7 @@ struct NOptionMap { NOptionMap(IO &, const ClangTidyOptions::OptionMap &OptionMap) { Options.reserve(OptionMap.size()); for (const auto &KeyValue : OptionMap) - Options.emplace_back(KeyValue.first, KeyValue.second.Value); + Options.emplace_back(std::string(KeyValue.getKey()), KeyValue.getValue().Value); } ClangTidyOptions::OptionMap denormalize(IO &) { ClangTidyOptions::OptionMap Map; @@ -114,11 +114,9 @@ ClangTidyOptions ClangTidyOptions::getDefaults() { Options.SystemHeaders = false; Options.FormatStyle = "none"; Options.User = llvm::None; - unsigned Priority = 0; for (const ClangTidyModuleRegistry::entry &Module : ClangTidyModuleRegistry::entries()) - Options = - Options.mergeWith(Module.instantiate()->getModuleOptions(), ++Priority); + Options = Options.mergeWith(Module.instantiate()->getModuleOptions(), 0); return Options; } @@ -159,8 +157,10 @@ ClangTidyOptions ClangTidyOptions::mergeWith(const ClangTidyOptions &Other, mergeVectors(Result.ExtraArgsBefore, Other.ExtraArgsBefore); for (const auto &KeyValue : Other.CheckOptions) { - Result.CheckOptions[KeyValue.first] = ClangTidyValue( - KeyValue.second.Value, KeyValue.second.Priority + Priority); + Result.CheckOptions.insert_or_assign( + KeyValue.getKey(), + ClangTidyValue(KeyValue.getValue().Value, + KeyValue.getValue().Priority + Priority)); } return Result; diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.h b/clang-tools-extra/clang-tidy/ClangTidyOptions.h index 0f3c1d413ec30..6bfcae0162e20 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.h +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.h @@ -16,7 +16,6 @@ #include "llvm/Support/ErrorOr.h" #include "llvm/Support/VirtualFileSystem.h" #include -#include #include #include #include @@ -108,7 +107,7 @@ struct ClangTidyOptions { unsigned Priority; }; typedef std::pair StringPair; - typedef std::map OptionMap; + typedef llvm::StringMap OptionMap; /// Key-value mapping used to store check-specific options. OptionMap CheckOptions; diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp index 11bbcbcb527f5..e775fc21d2d0f 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.cpp @@ -26,8 +26,8 @@ StringFindStartswithCheck::StringFindStartswithCheck(StringRef Name, : ClangTidyCheck(Name, Context), StringLikeClasses(utils::options::parseStringList( Options.get("StringLikeClasses", "::std::basic_string"))), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), AbseilStringsMatchHeader( Options.get("AbseilStringsMatchHeader", "absl/strings/match.h")) {} @@ -105,23 +105,21 @@ void StringFindStartswithCheck::check(const MatchFinder::MatchResult &Result) { // Create a preprocessor #include FixIt hint (CreateIncludeInsertion checks // whether this already exists). 
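A note on the OptionMap change a few hunks back: moving from std::map to llvm::StringMap changes the entry API (first/second become getKey()/getValue(), hence the loop updates throughout this patch) and gives up sorted iteration. A small sketch of the idioms now relied on (keys and values are illustrative):

  #include "llvm/ADT/StringMap.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::StringMap<int> Priorities;
    // insert_or_assign, as used by registerCheckFactory and mergeWith above,
    // overwrites an existing value, unlike insert, which keeps the old one.
    Priorities.insert_or_assign("readability-example", 1);
    Priorities.insert_or_assign("readability-example", 2); // value is now 2

    for (const auto &KeyValue : Priorities)
      // StringMap entries expose getKey()/getValue(); iteration order is
      // neither sorted nor stable, so no caller may depend on it.
      llvm::outs() << KeyValue.getKey() << ": " << KeyValue.getValue() << "\n";
    return 0;
  }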
- Diagnostic << IncludeInserter->CreateIncludeInsertion( + Diagnostic << IncludeInserter.createIncludeInsertion( Source.getFileID(ComparisonExpr->getBeginLoc()), AbseilStringsMatchHeader, false); } void StringFindStartswithCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void StringFindStartswithCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "StringLikeClasses", utils::options::serializeStringList(StringLikeClasses)); - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); Options.store(Opts, "AbseilStringsMatchHeader", AbseilStringsMatchHeader); } diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h index d232d3b3efb61..2bb20f78b68ce 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStartswithCheck.h @@ -35,9 +35,8 @@ class StringFindStartswithCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override; private: - std::unique_ptr IncludeInserter; const std::vector StringLikeClasses; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; const std::string AbseilStringsMatchHeader; }; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index f1755d3f9b855..3f51ef595b463 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -26,12 +26,12 @@ AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); } InitVariablesCheck::InitVariablesCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), MathHeader(Options.get("MathHeader", "math.h")) {} void InitVariablesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); Options.store(Opts, "MathHeader", MathHeader); } @@ -51,9 +51,7 @@ void InitVariablesCheck::registerMatchers(MatchFinder *Finder) { void InitVariablesCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = - std::make_unique(SM, getLangOpts(), IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { @@ -104,7 +102,7 @@ void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { MatchedDecl->getName().size()), InitializationString); if (AddMathInclude) { - Diagnostic << IncludeInserter->CreateIncludeInsertion( + Diagnostic << IncludeInserter.createIncludeInsertion( Source.getFileID(MatchedDecl->getBeginLoc()), MathHeader, false); } } diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h 
b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h index 61521b118a99e..0f778104ce398 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h @@ -31,8 +31,7 @@ class InitVariablesCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; const std::string MathHeader; }; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp index 96b0bb0f9b02d..f45801f1ea723 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.cpp @@ -21,20 +21,18 @@ namespace cppcoreguidelines { ProBoundsConstantArrayIndexCheck::ProBoundsConstantArrayIndexCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), GslHeader(Options.get("GslHeader", "")), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void ProBoundsConstantArrayIndexCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "GslHeader", GslHeader); - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } void ProBoundsConstantArrayIndexCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void ProBoundsConstantArrayIndexCheck::registerMatchers(MatchFinder *Finder) { @@ -87,9 +85,8 @@ void ProBoundsConstantArrayIndexCheck::check( IndexRange.getBegin().getLocWithOffset(-1)), ", ") << FixItHint::CreateReplacement(Matched->getEndLoc(), ")") - << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), GslHeader, - /*IsAngled=*/false); + << Inserter.createMainFileIncludeInsertion(GslHeader, + /*IsAngled=*/false); } return; } diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index ac7475b4372db..04a51b93a04c5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -23,8 +23,7 @@ namespace cppcoreguidelines { /// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines-pro-bounds-constant-array-index.html class ProBoundsConstantArrayIndexCheck : public ClangTidyCheck { const std::string GslHeader; - const utils::IncludeSorter::IncludeStyle IncludeStyle; - std::unique_ptr Inserter; + utils::IncludeInserter Inserter; public: ProBoundsConstantArrayIndexCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index c677043946f7f..5818b8cd06b5c 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -44,8 +44,8 @@ const char MakeSmartPtrCheck::PointerType[] = "pointerType"; MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context, StringRef MakeSmartPtrFunctionName) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), MakeSmartPtrFunctionHeader( Options.get("MakeSmartPtrFunctionHeader", StdMemoryHeader)), MakeSmartPtrFunctionName( @@ -53,7 +53,7 @@ MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context, IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {} void MakeSmartPtrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "MakeSmartPtrFunctionHeader", MakeSmartPtrFunctionHeader); Options.store(Opts, "MakeSmartPtrFunction", MakeSmartPtrFunctionName); Options.store(Opts, "IgnoreMacros", IgnoreMacros); @@ -67,9 +67,7 @@ bool MakeSmartPtrCheck::isLanguageVersionSupported( void MakeSmartPtrCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void MakeSmartPtrCheck::registerMatchers(ast_matchers::MatchFinder *Finder) { @@ -432,7 +430,7 @@ void MakeSmartPtrCheck::insertHeader(DiagnosticBuilder &Diag, FileID FD) { if (MakeSmartPtrFunctionHeader.empty()) { return; } - Diag << Inserter->CreateIncludeInsertion( + Diag << Inserter.createIncludeInsertion( FD, MakeSmartPtrFunctionHeader, /*IsAngled=*/MakeSmartPtrFunctionHeader == StdMemoryHeader); } diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h index 1f738737ab264..7a1bba624c539 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.h @@ -46,8 +46,7 @@ class MakeSmartPtrCheck : public ClangTidyCheck { static const char PointerType[]; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const std::string MakeSmartPtrFunctionHeader; const std::string MakeSmartPtrFunctionName; const bool IgnoreMacros; diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp index b6dedfbc2b6eb..b955ea7f7572b 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.cpp @@ -120,12 +120,12 @@ collectParamDecls(const CXXConstructorDecl *Ctor, PassByValueCheck::PassByValueCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), ValuesOnly(Options.get("ValuesOnly", false)) {} void PassByValueCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "ValuesOnly", ValuesOnly); } @@ -167,9 +167,7 @@ void 
PassByValueCheck::registerMatchers(MatchFinder *Finder) { void PassByValueCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { @@ -216,7 +214,7 @@ void PassByValueCheck::check(const MatchFinder::MatchResult &Result) { Diag << FixItHint::CreateInsertion(Initializer->getRParenLoc(), ")") << FixItHint::CreateInsertion( Initializer->getLParenLoc().getLocWithOffset(1), "std::move(") - << Inserter->CreateIncludeInsertion( + << Inserter.createIncludeInsertion( Result.SourceManager->getFileID(Initializer->getSourceLocation()), "utility", /*IsAngled=*/true); diff --git a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h index 7abda91ac5632..82cd9d44c5e3a 100644 --- a/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/PassByValueCheck.h @@ -31,8 +31,7 @@ class PassByValueCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const bool ValuesOnly; }; diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp index f98254dbf7c83..25ffbe2b8738d 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.cpp @@ -74,11 +74,11 @@ AST_MATCHER(Decl, isFromStdNamespace) { ReplaceAutoPtrCheck::ReplaceAutoPtrCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void ReplaceAutoPtrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } void ReplaceAutoPtrCheck::registerMatchers(MatchFinder *Finder) { @@ -131,9 +131,7 @@ void ReplaceAutoPtrCheck::registerMatchers(MatchFinder *Finder) { void ReplaceAutoPtrCheck::registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { @@ -146,12 +144,11 @@ void ReplaceAutoPtrCheck::check(const MatchFinder::MatchResult &Result) { if (Range.isInvalid()) return; - auto Diag = - diag(Range.getBegin(), "use std::move to transfer ownership") - << FixItHint::CreateInsertion(Range.getBegin(), "std::move(") - << FixItHint::CreateInsertion(Range.getEnd(), ")") - << Inserter->CreateIncludeInsertion(SM.getMainFileID(), "utility", - /*IsAngled=*/true); + auto Diag = diag(Range.getBegin(), "use std::move to transfer ownership") + << FixItHint::CreateInsertion(Range.getBegin(), "std::move(") + << FixItHint::CreateInsertion(Range.getEnd(), ")") + << Inserter.createMainFileIncludeInsertion("utility", + /*IsAngled=*/true); return; } diff --git 
a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h index e2b04073a65f4..8288c7e47d35e 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceAutoPtrCheck.h @@ -53,8 +53,7 @@ class ReplaceAutoPtrCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; }; } // namespace modernize diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp index 66917df3e91d2..0191f5d5c5deb 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.cpp @@ -23,8 +23,9 @@ namespace modernize { ReplaceRandomShuffleCheck::ReplaceRandomShuffleCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) { +} void ReplaceRandomShuffleCheck::registerMatchers(MatchFinder *Finder) { const auto Begin = hasArgument(0, expr()); @@ -44,14 +45,12 @@ void ReplaceRandomShuffleCheck::registerMatchers(MatchFinder *Finder) { void ReplaceRandomShuffleCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void ReplaceRandomShuffleCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); } void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { @@ -92,7 +91,7 @@ void ReplaceRandomShuffleCheck::check(const MatchFinder::MatchResult &Result) { Diag << FixItHint::CreateRemoval(MatchedDecl->getSourceRange()); Diag << FixItHint::CreateInsertion(MatchedDecl->getBeginLoc(), NewName); - Diag << IncludeInserter->CreateIncludeInsertion( + Diag << IncludeInserter.createIncludeInsertion( Result.Context->getSourceManager().getFileID( MatchedCallExpr->getBeginLoc()), "random", /*IsAngled=*/true); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index c4ac74d6e662a..990dcffc79a6a 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -34,8 +34,7 @@ class ReplaceRandomShuffleCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; }; } // namespace modernize diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp index 4cbb014867c4d..6b42cd3180d73 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.cpp @@ -23,8 +23,8 @@ namespace performance { MoveConstructorInitCheck::MoveConstructorInitCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) {} void MoveConstructorInitCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( @@ -90,13 +90,11 @@ void MoveConstructorInitCheck::check(const MatchFinder::MatchResult &Result) { void MoveConstructorInitCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void MoveConstructorInitCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h index 0473978f29db4..0b637b617782e 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h @@ -36,8 +36,7 @@ class MoveConstructorInitCheck : public ClangTidyCheck { void storeOptions(ClangTidyOptions::OptionMap &Opts) override; private: - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; }; } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp index 597445d0fc266..2105aa9947bb0 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.cpp @@ -31,19 +31,18 @@ AST_MATCHER_P(Type, isBuiltinType, BuiltinType::Kind, Kind) { TypePromotionInMathFnCheck::TypePromotionInMathFnCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)) {} + IncludeInserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)) { +} void TypePromotionInMathFnCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - IncludeInserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(IncludeInserter->CreatePPCallbacks()); + IncludeInserter.registerPreprocessor(PP); } void TypePromotionInMathFnCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", IncludeInserter.getStyle()); } void TypePromotionInMathFnCheck::registerMatchers(MatchFinder *Finder) { @@ -191,7 +190,7 @@ void TypePromotionInMathFnCheck::check(const MatchFinder::MatchResult &Result) { // , because the functions we're suggesting moving away from are all // declared in . 
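Every check in this patch receives the same mechanical IncludeInserter migration; condensed into one before/after sketch (the check is hypothetical, the API calls are the ones introduced here):

  // Assumes ../ClangTidyCheck.h and ../utils/IncludeInserter.h are included.
  // Previously each check held:
  //   std::unique_ptr<utils::IncludeInserter> Inserter;
  //   const utils::IncludeSorter::IncludeStyle IncludeStyle;
  // and rebuilt the inserter in registerPPCallbacks via make_unique +
  // PP->addPPCallbacks(Inserter->CreatePPCallbacks()).
  class MyCheck : public ClangTidyCheck {
  public:
    MyCheck(StringRef Name, ClangTidyContext *Context)
        : ClangTidyCheck(Name, Context),
          Inserter(Options.getLocalOrGlobal("IncludeStyle",
                                            utils::IncludeSorter::IS_LLVM)) {}

    void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP,
                             Preprocessor *ModuleExpanderPP) override {
      Inserter.registerPreprocessor(PP); // replaces the make_unique dance
    }

    void storeOptions(ClangTidyOptions::OptionMap &Opts) override {
      // The style option now round-trips through the inserter itself.
      Options.store(Opts, "IncludeStyle", Inserter.getStyle());
    }

  private:
    utils::IncludeInserter Inserter; // plain member, no unique_ptr
  };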
if (FnInCmath) - Diag << IncludeInserter->CreateIncludeInsertion( + Diag << IncludeInserter.createIncludeInsertion( Result.Context->getSourceManager().getFileID(Call->getBeginLoc()), "cmath", /*IsAngled=*/true); } diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index d1cc042ca6a35..dd7c1c090bad3 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -36,8 +36,7 @@ class TypePromotionInMathFnCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; private: - std::unique_ptr IncludeInserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter IncludeInserter; }; } // namespace performance diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 5de53b1840f12..9aef5a8681694 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -68,8 +68,8 @@ bool isExplicitTemplateSpecialization(const FunctionDecl &Function) { UnnecessaryValueParamCheck::UnnecessaryValueParamCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - utils::IncludeSorter::IS_LLVM)), + Inserter(Options.getLocalOrGlobal("IncludeStyle", + utils::IncludeSorter::IS_LLVM)), AllowedTypes( utils::options::parseStringList(Options.get("AllowedTypes", ""))) {} @@ -173,14 +173,12 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { void UnnecessaryValueParamCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - Inserter = std::make_unique(SM, getLangOpts(), - IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void UnnecessaryValueParamCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "AllowedTypes", utils::options::serializeStringList(AllowedTypes)); } @@ -204,7 +202,7 @@ void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var, Context.getLangOpts()); Diag << FixItHint::CreateInsertion(CopyArgument.getBeginLoc(), "std::move(") << FixItHint::CreateInsertion(EndLoc, ")") - << Inserter->CreateIncludeInsertion( + << Inserter.createIncludeInsertion( SM.getFileID(CopyArgument.getBeginLoc()), "utility", /*IsAngled=*/true); } diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 1d2367148ef82..a84079e7da229 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -41,8 +41,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { llvm::DenseMap MutationAnalyzers; - std::unique_ptr Inserter; - const utils::IncludeSorter::IncludeStyle IncludeStyle; + utils::IncludeInserter Inserter; const std::vector AllowedTypes; }; diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp 
b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index c885aac89072a..c2a32474b2a8b 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -8,6 +8,7 @@ #include "IdentifierNamingCheck.h" +#include "../GlobList.h" #include "clang/AST/CXXInheritance.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" @@ -15,7 +16,8 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" -#include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Regex.h" #define DEBUG_TYPE "clang-tidy" @@ -119,54 +121,59 @@ static StringRef const StyleNames[] = { #undef NAMING_KEYS // clang-format on +static std::vector> +getNamingStyles(const ClangTidyCheck::OptionsView &Options) { + std::vector> Styles; + Styles.reserve(array_lengthof(StyleNames)); + for (auto const &StyleName : StyleNames) { + auto CaseOptional = Options.getOptional( + (StyleName + "Case").str()); + auto Prefix = Options.get((StyleName + "Prefix").str(), ""); + auto Postfix = Options.get((StyleName + "Suffix").str(), ""); + + if (CaseOptional || !Prefix.empty() || !Postfix.empty()) + Styles.emplace_back(IdentifierNamingCheck::NamingStyle{ + std::move(CaseOptional), std::move(Prefix), std::move(Postfix)}); + else + Styles.emplace_back(llvm::None); + } + return Styles; +} + IdentifierNamingCheck::IdentifierNamingCheck(StringRef Name, ClangTidyContext *Context) - : RenamerClangTidyCheck(Name, Context), + : RenamerClangTidyCheck(Name, Context), Context(Context), CheckName(Name), + GetConfigPerFile(Options.get("GetConfigPerFile", true)), IgnoreFailedSplit(Options.get("IgnoreFailedSplit", false)), IgnoreMainLikeFunctions(Options.get("IgnoreMainLikeFunctions", false)) { - for (auto const &Name : StyleNames) { - auto CaseOptional = [&]() -> llvm::Optional { - auto ValueOr = Options.get((Name + "Case").str()); - if (ValueOr) - return *ValueOr; - llvm::logAllUnhandledErrors( - llvm::handleErrors(ValueOr.takeError(), - [](const MissingOptionError &) -> llvm::Error { - return llvm::Error::success(); - }), - llvm::errs(), "warning: "); - return llvm::None; - }(); - - auto prefix = Options.get((Name + "Prefix").str(), ""); - auto postfix = Options.get((Name + "Suffix").str(), ""); - - if (CaseOptional || !prefix.empty() || !postfix.empty()) { - NamingStyles.push_back(NamingStyle(CaseOptional, prefix, postfix)); - } else { - NamingStyles.push_back(llvm::None); - } - } + auto IterAndInserted = NamingStylesCache.try_emplace( + llvm::sys::path::parent_path(Context->getCurrentFile()), + getNamingStyles(Options)); + assert(IterAndInserted.second && "Couldn't insert Style"); + // Holding a reference to the data in the vector is safe as it should never + // move. 
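The safety claim in that comment rests on a StringMap property worth stating: entries are allocated individually, so growing the map moves bucket pointers but never the mapped values, and a reference into an entry stays valid until that entry is erased (this cache only ever adds entries). A toy illustration:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/StringMap.h"
  #include <cassert>
  #include <string>
  #include <vector>

  int main() {
    llvm::StringMap<std::vector<int>> Cache;
    auto It = Cache.try_emplace("some/dir", std::vector<int>{1, 2, 3}).first;
    llvm::ArrayRef<int> View = It->getValue(); // view into the cached vector

    // Force plenty of growth and rehashing; the entry (and the vector it
    // owns) is never relocated, so View stays valid as long as nothing
    // mutates or erases the cached vector itself.
    for (int I = 0; I < 1000; ++I)
      Cache.try_emplace("dir" + std::to_string(I), std::vector<int>{I});

    assert(View.size() == 3 && View[1] == 2);
    return 0;
  }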
+ MainFileStyle = IterAndInserted.first->getValue(); } IdentifierNamingCheck::~IdentifierNamingCheck() = default; void IdentifierNamingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { RenamerClangTidyCheck::storeOptions(Opts); - for (size_t i = 0; i < SK_Count; ++i) { - if (NamingStyles[i]) { - if (NamingStyles[i]->Case) { - Options.store(Opts, (StyleNames[i] + "Case").str(), - *NamingStyles[i]->Case); - } - Options.store(Opts, (StyleNames[i] + "Prefix").str(), - NamingStyles[i]->Prefix); - Options.store(Opts, (StyleNames[i] + "Suffix").str(), - NamingStyles[i]->Suffix); - } + ArrayRef> NamingStyles = + getStyleForFile(Context->getCurrentFile()); + for (size_t I = 0; I < SK_Count; ++I) { + if (!NamingStyles[I]) + continue; + if (NamingStyles[I]->Case) + Options.store(Opts, (StyleNames[I] + "Case").str(), + *NamingStyles[I]->Case); + Options.store(Opts, (StyleNames[I] + "Prefix").str(), + NamingStyles[I]->Prefix); + Options.store(Opts, (StyleNames[I] + "Suffix").str(), + NamingStyles[I]->Suffix); } - + Options.store(Opts, "GetConfigPerFile", GetConfigPerFile); Options.store(Opts, "IgnoreFailedSplit", IgnoreFailedSplit); Options.store(Opts, "IgnoreMainLikeFunctions", IgnoreMainLikeFunctions); } @@ -183,14 +190,9 @@ static bool matchesStyle(StringRef Name, llvm::Regex("^[a-z]([a-z0-9]*(_[A-Z])?)*"), }; - if (Name.startswith(Style.Prefix)) - Name = Name.drop_front(Style.Prefix.size()); - else + if (!Name.consume_front(Style.Prefix)) return false; - - if (Name.endswith(Style.Suffix)) - Name = Name.drop_back(Style.Suffix.size()); - else + if (!Name.consume_back(Style.Suffix)) return false; // Ensure the name doesn't have any extra underscores beyond those specified @@ -213,9 +215,10 @@ static std::string fixupWithCase(StringRef Name, Name.split(Substrs, "_", -1, false); SmallVector Words; + SmallVector Groups; for (auto Substr : Substrs) { while (!Substr.empty()) { - SmallVector Groups; + Groups.clear(); if (!Splitter.match(Substr, &Groups)) break; @@ -233,12 +236,12 @@ static std::string fixupWithCase(StringRef Name, } if (Words.empty()) - return std::string(Name); + return Name.str(); - std::string Fixup; + SmallString<128> Fixup; switch (Case) { case IdentifierNamingCheck::CT_AnyCase: - Fixup += Name; + return Name.str(); break; case IdentifierNamingCheck::CT_LowerCase: @@ -259,7 +262,7 @@ static std::string fixupWithCase(StringRef Name, case IdentifierNamingCheck::CT_CamelCase: for (auto const &Word : Words) { - Fixup += Word.substr(0, 1).upper(); + Fixup += toupper(Word.front()); Fixup += Word.substr(1).lower(); } break; @@ -269,7 +272,7 @@ static std::string fixupWithCase(StringRef Name, if (&Word == &Words.front()) { Fixup += Word.lower(); } else { - Fixup += Word.substr(0, 1).upper(); + Fixup += toupper(Word.front()); Fixup += Word.substr(1).lower(); } } @@ -279,7 +282,7 @@ static std::string fixupWithCase(StringRef Name, for (auto const &Word : Words) { if (&Word != &Words.front()) Fixup += "_"; - Fixup += Word.substr(0, 1).upper(); + Fixup += toupper(Word.front()); Fixup += Word.substr(1).lower(); } break; @@ -288,16 +291,16 @@ static std::string fixupWithCase(StringRef Name, for (auto const &Word : Words) { if (&Word != &Words.front()) { Fixup += "_"; - Fixup += Word.substr(0, 1).upper(); + Fixup += toupper(Word.front()); } else { - Fixup += Word.substr(0, 1).lower(); + Fixup += tolower(Word.front()); } Fixup += Word.substr(1).lower(); } break; } - return Fixup; + return Fixup.str().str(); } static bool isParamInMainLikeFunction(const ParmVarDecl &ParmDecl, @@ -374,8 +377,7 @@ 
fixupWithStyle(StringRef Name, static StyleKind findStyleKind( const NamedDecl *D, - const std::vector> - &NamingStyles, + ArrayRef> NamingStyles, bool IgnoreMainLikeFunctions) { assert(D && D->getIdentifier() && !D->getName().empty() && !D->isImplicit() && "Decl must be an explicit identifier with a name."); @@ -652,74 +654,82 @@ static StyleKind findStyleKind( return SK_Invalid; } -llvm::Optional -IdentifierNamingCheck::GetDeclFailureInfo(const NamedDecl *Decl, - const SourceManager &SM) const { - StyleKind SK = findStyleKind(Decl, NamingStyles, IgnoreMainLikeFunctions); - if (SK == SK_Invalid) - return None; - - if (!NamingStyles[SK]) +static llvm::Optional getFailureInfo( + StringRef Name, SourceLocation Location, + ArrayRef> NamingStyles, + StyleKind SK, const SourceManager &SM, bool IgnoreFailedSplit) { + if (SK == SK_Invalid || !NamingStyles[SK]) return None; - const NamingStyle &Style = *NamingStyles[SK]; - StringRef Name = Decl->getName(); + const IdentifierNamingCheck::NamingStyle &Style = *NamingStyles[SK]; if (matchesStyle(Name, Style)) return None; - std::string KindName = fixupWithCase(StyleNames[SK], CT_LowerCase); + std::string KindName = + fixupWithCase(StyleNames[SK], IdentifierNamingCheck::CT_LowerCase); std::replace(KindName.begin(), KindName.end(), '_', ' '); std::string Fixup = fixupWithStyle(Name, Style); if (StringRef(Fixup).equals(Name)) { if (!IgnoreFailedSplit) { - LLVM_DEBUG(llvm::dbgs() - << Decl->getBeginLoc().printToString(SM) - << llvm::format(": unable to split words for %s '%s'\n", - KindName.c_str(), Name.str().c_str())); + LLVM_DEBUG(Location.print(llvm::dbgs(), SM); + llvm::dbgs() + << llvm::formatv(": unable to split words for {0} '{1}'\n", + KindName, Name)); } return None; } - return FailureInfo{std::move(KindName), std::move(Fixup)}; + return RenamerClangTidyCheck::FailureInfo{std::move(KindName), + std::move(Fixup)}; +} + +llvm::Optional +IdentifierNamingCheck::GetDeclFailureInfo(const NamedDecl *Decl, + const SourceManager &SM) const { + SourceLocation Loc = Decl->getLocation(); + ArrayRef> NamingStyles = + getStyleForFile(SM.getFilename(Loc)); + + return getFailureInfo( + Decl->getName(), Loc, NamingStyles, + findStyleKind(Decl, NamingStyles, IgnoreMainLikeFunctions), SM, + IgnoreFailedSplit); } llvm::Optional IdentifierNamingCheck::GetMacroFailureInfo(const Token &MacroNameTok, const SourceManager &SM) const { - if (!NamingStyles[SK_MacroDefinition]) - return None; - - StringRef Name = MacroNameTok.getIdentifierInfo()->getName(); - const NamingStyle &Style = *NamingStyles[SK_MacroDefinition]; - if (matchesStyle(Name, Style)) - return None; - - std::string KindName = - fixupWithCase(StyleNames[SK_MacroDefinition], CT_LowerCase); - std::replace(KindName.begin(), KindName.end(), '_', ' '); + SourceLocation Loc = MacroNameTok.getLocation(); - std::string Fixup = fixupWithStyle(Name, Style); - if (StringRef(Fixup).equals(Name)) { - if (!IgnoreFailedSplit) { - LLVM_DEBUG(llvm::dbgs() - << MacroNameTok.getLocation().printToString(SM) - << llvm::format(": unable to split words for %s '%s'\n", - KindName.c_str(), Name.str().c_str())); - } - return None; - } - return FailureInfo{std::move(KindName), std::move(Fixup)}; + return getFailureInfo(MacroNameTok.getIdentifierInfo()->getName(), Loc, + getStyleForFile(SM.getFilename(Loc)), + SK_MacroDefinition, SM, IgnoreFailedSplit); } RenamerClangTidyCheck::DiagInfo IdentifierNamingCheck::GetDiagInfo(const NamingCheckId &ID, const NamingCheckFailure &Failure) const { return DiagInfo{"invalid case style for %0 
'%1'", - [&](DiagnosticBuilder &diag) { - diag << Failure.Info.KindName << ID.second; + [&](DiagnosticBuilder &Diag) { + Diag << Failure.Info.KindName << ID.second; }}; } +ArrayRef> +IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { + if (!GetConfigPerFile) + return MainFileStyle; + auto &Styles = NamingStylesCache[llvm::sys::path::parent_path(FileName)]; + if (Styles.empty()) { + ClangTidyOptions Options = Context->getOptionsForFile(FileName); + if (Options.Checks && GlobList(*Options.Checks).contains(CheckName)) + Styles = getNamingStyles({CheckName, Options.CheckOptions}); + else + Styles.resize(SK_Count, None); + } + return Styles; +} + } // namespace readability } // namespace tidy } // namespace clang diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 0f6c77b2c9a86..ad1c582d100bc 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_IDENTIFIERNAMINGCHECK_H #include "../utils/RenamerClangTidyCheck.h" +#include "llvm/ADT/Optional.h" namespace clang { class MacroInfo; @@ -69,7 +70,17 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { DiagInfo GetDiagInfo(const NamingCheckId &ID, const NamingCheckFailure &Failure) const override; - std::vector> NamingStyles; + ArrayRef> + getStyleForFile(StringRef FileName) const; + + /// Stores the style options as a vector, indexed by the specified \ref + /// StyleKind, for a given directory. + mutable llvm::StringMap>> + NamingStylesCache; + ArrayRef> MainFileStyle; + ClangTidyContext *const Context; + const std::string CheckName; + const bool GetConfigPerFile; const bool IgnoreFailedSplit; const bool IgnoreMainLikeFunctions; }; diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp index bea02a6ba111b..1f371eed2db80 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp @@ -92,16 +92,18 @@ void RedundantStringCStrCheck::registerMatchers( callee(memberExpr().bind("member")), callee(cxxMethodDecl(hasAnyName("c_str", "data")))) .bind("call"); - + const auto HasRValueTempParent = + hasParent(materializeTemporaryExpr(unless(isBoundToLValue()))); // Detect redundant 'c_str()' calls through a string constructor. // If CxxConstructExpr is the part of some CallExpr we need to // check that matched ParamDecl of the ancestor CallExpr is not rvalue. 
Finder->addMatcher( - traverse(ast_type_traits::TK_AsIs, - cxxConstructExpr(StringConstructorExpr, - hasArgument(0, StringCStrCallExpr), - unless(hasParent(materializeTemporaryExpr( - unless(isBoundToLValue())))))), + traverse( + ast_type_traits::TK_AsIs, + cxxConstructExpr( + StringConstructorExpr, hasArgument(0, StringCStrCallExpr), + unless(anyOf(HasRValueTempParent, hasParent(cxxBindTemporaryExpr( + HasRValueTempParent)))))), this); // Detect: 's == str.c_str()' -> 's == str' diff --git a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp index 68bb987c1275f..1cae618dfd093 100644 --- a/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp +++ b/clang-tools-extra/clang-tidy/utils/HeaderGuard.cpp @@ -123,12 +123,7 @@ class HeaderGuardPPCallbacks : public PPCallbacks { // Emit warnings for headers that are missing guards. checkGuardlessHeaders(); - - // Clear all state. - Macros.clear(); - Files.clear(); - Ifndefs.clear(); - EndIfs.clear(); + clearAllState(); } bool wouldFixEndifComment(StringRef FileName, SourceLocation EndIf, @@ -255,6 +250,13 @@ class HeaderGuardPPCallbacks : public PPCallbacks { } private: + void clearAllState() { + Macros.clear(); + Files.clear(); + Ifndefs.clear(); + EndIfs.clear(); + } + std::vector> Macros; llvm::StringMap Files; std::map> diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp index df87dbe49cff8..268692c3ba42e 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "IncludeInserter.h" +#include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Lex/Token.h" namespace clang { @@ -26,7 +28,7 @@ class IncludeInserterCallback : public PPCallbacks { StringRef /*SearchPath*/, StringRef /*RelativePath*/, const Module * /*ImportedModule*/, SrcMgr::CharacteristicKind /*FileType*/) override { - Inserter->AddInclude(FileNameRef, IsAngled, HashLocation, + Inserter->addInclude(FileNameRef, IsAngled, HashLocation, IncludeToken.getEndLoc()); } @@ -34,45 +36,61 @@ class IncludeInserterCallback : public PPCallbacks { IncludeInserter *Inserter; }; -IncludeInserter::IncludeInserter(const SourceManager &SourceMgr, - const LangOptions &LangOpts, - IncludeSorter::IncludeStyle Style) - : SourceMgr(SourceMgr), Style(Style) {} +IncludeInserter::IncludeInserter(IncludeSorter::IncludeStyle Style) + : Style(Style) {} -IncludeInserter::~IncludeInserter() {} +void IncludeInserter::registerPreprocessor(Preprocessor *PP) { + assert(PP && "PP shouldn't be null"); + SourceMgr = &PP->getSourceManager(); -std::unique_ptr IncludeInserter::CreatePPCallbacks() { - return std::make_unique(this); + // If this gets registered multiple times, clear the maps + if (!IncludeSorterByFile.empty()) + IncludeSorterByFile.clear(); + if (!InsertedHeaders.empty()) + InsertedHeaders.clear(); + PP->addPPCallbacks(std::make_unique(this)); } IncludeSorter &IncludeInserter::getOrCreate(FileID FileID) { + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); // std::unique_ptr is cheap to construct, so force a construction now to save // the lookup needed if we were to insert into the map. std::unique_ptr &Entry = IncludeSorterByFile[FileID]; if (!Entry) { // If it wasn't found, Entry will be default constructed to nullptr. 
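getOrCreate() relies on operator[] default-constructing the mapped value, so one map access serves as both lookup and insertion slot; the assignment that follows runs only on first use. The same idiom in isolation, with a hypothetical Widget payload:

    #include "llvm/ADT/DenseMap.h"
    #include <memory>

    struct Widget {}; // hypothetical payload type

    llvm::DenseMap<int, std::unique_ptr<Widget>> Cache;

    Widget &getOrCreate(int Key) {
      // operator[] inserts a null unique_ptr when Key is absent, so the
      // hit and miss paths share a single hash lookup.
      std::unique_ptr<Widget> &Entry = Cache[Key];
      if (!Entry)
        Entry = std::make_unique<Widget>(); // constructed once, on demand
      return *Entry;
    }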
Entry = std::make_unique<IncludeSorter>( - &SourceMgr, FileID, - SourceMgr.getFilename(SourceMgr.getLocForStartOfFile(FileID)), Style); + SourceMgr, FileID, + SourceMgr->getFilename(SourceMgr->getLocForStartOfFile(FileID)), Style); } return *Entry; } llvm::Optional<FixItHint> -IncludeInserter::CreateIncludeInsertion(FileID FileID, StringRef Header, +IncludeInserter::createIncludeInsertion(FileID FileID, StringRef Header, bool IsAngled) { // We assume the same Header will never be included both angled and not // angled. - if (!InsertedHeaders[FileID].insert(std::string(Header)).second) + if (!InsertedHeaders[FileID].insert(Header).second) return llvm::None; return getOrCreate(FileID).CreateIncludeInsertion(Header, IsAngled); } -void IncludeInserter::AddInclude(StringRef FileName, bool IsAngled, +llvm::Optional<FixItHint> +IncludeInserter::createMainFileIncludeInsertion(StringRef Header, + bool IsAngled) { + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); + return createIncludeInsertion(SourceMgr->getMainFileID(), Header, IsAngled); +} + +void IncludeInserter::addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation) { - FileID FileID = SourceMgr.getFileID(HashLocation); + assert(SourceMgr && "SourceMgr shouldn't be null; did you remember to call " + "registerPreprocessor()?"); + FileID FileID = SourceMgr->getFileID(HashLocation); getOrCreate(FileID).AddInclude(FileName, IsAngled, HashLocation, EndLocation); } diff --git a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h index 0d4b951beb1ff..70c36ce8895c4 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeInserter.h +++ b/clang-tools-extra/clang-tidy/utils/IncludeInserter.h @@ -11,13 +11,11 @@ #include "IncludeSorter.h" #include "clang/Basic/Diagnostic.h" -#include "clang/Basic/LangOptions.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Lex/PPCallbacks.h" +#include "llvm/ADT/StringSet.h" #include <memory> -#include <set> namespace clang { +class Preprocessor; namespace tidy { namespace utils { @@ -26,16 +24,17 @@ namespace utils { /// /// ``IncludeInserter`` can be used in clang-tidy checks in the following way: /// \code +/// #include "../ClangTidyCheck.h" /// #include "../utils/IncludeInserter.h" -/// #include "clang/Frontend/CompilerInstance.h" +/// +/// namespace clang { +/// namespace tidy { /// /// class MyCheck : public ClangTidyCheck { /// public: /// void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, /// Preprocessor *ModuleExpanderPP) override { -/// Inserter = std::make_unique<utils::IncludeInserter>( -/// SM, getLangOpts(), utils::IncludeSorter::IS_Google); -/// PP->addPPCallbacks(Inserter->CreatePPCallbacks()); +/// Inserter.registerPreprocessor(PP); /// } /// /// void registerMatchers(ast_matchers::MatchFinder* Finder) override { ... } @@ -43,39 +42,53 @@ namespace utils { /// void check( /// const ast_matchers::MatchFinder::MatchResult& Result) override { /// ... -/// Inserter->CreateIncludeInsertion( -/// Result.SourceManager->getMainFileID(), "path/to/Header.h", -/// /*IsAngled=*/false); +/// Inserter.createMainFileIncludeInsertion("path/to/Header.h", +/// /*IsAngled=*/false); /// ...
/// } /// /// private: -/// std::unique_ptr Inserter; +/// utils::IncludeInserter Inserter{utils::IncludeSorter::IS_Google}; /// }; +/// } // namespace tidy +/// } // namespace clang /// \endcode class IncludeInserter { public: - IncludeInserter(const SourceManager &SourceMgr, const LangOptions &LangOpts, - IncludeSorter::IncludeStyle Style); - ~IncludeInserter(); + /// Initializes the IncludeInserter using the IncludeStyle \p Style. + /// In most cases the \p Style will be retrieved from the ClangTidyOptions + /// using \code + /// Options.getLocalOrGlobal("IncludeStyle", ) + /// \endcode + explicit IncludeInserter(IncludeSorter::IncludeStyle Style); + + /// Registers this with the Preprocessor \p PP, must be called before this + /// class is used. + void registerPreprocessor(Preprocessor *PP); - /// Create ``PPCallbacks`` for registration with the compiler's preprocessor. - std::unique_ptr CreatePPCallbacks(); + /// Creates a \p Header inclusion directive fixit in the File \p FileID. + /// Returns ``llvm::None`` on error or if the inclusion directive already + /// exists. + llvm::Optional + createIncludeInsertion(FileID FileID, llvm::StringRef Header, bool IsAngled); - /// Creates a \p Header inclusion directive fixit. Returns ``llvm::None`` on - /// error or if inclusion directive already exists. + /// Creates a \p Header inclusion directive fixit in the main file. + /// Returns``llvm::None`` on error or if the inclusion directive already + /// exists. llvm::Optional - CreateIncludeInsertion(FileID FileID, llvm::StringRef Header, bool IsAngled); + createMainFileIncludeInsertion(llvm::StringRef Header, bool IsAngled); + + IncludeSorter::IncludeStyle getStyle() const { return Style; } private: - void AddInclude(StringRef FileName, bool IsAngled, + void addInclude(StringRef FileName, bool IsAngled, SourceLocation HashLocation, SourceLocation EndLocation); IncludeSorter &getOrCreate(FileID FileID); llvm::DenseMap> IncludeSorterByFile; - llvm::DenseMap> InsertedHeaders; - const SourceManager &SourceMgr; + llvm::DenseMap> InsertedHeaders; + const SourceManager *SourceMgr{nullptr}; const IncludeSorter::IncludeStyle Style; friend class IncludeInserterCallback; }; diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 040378d980f1a..2d67ca4a16180 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -157,6 +157,9 @@ void RenamerClangTidyCheck::addUsage( RenamerClangTidyCheck::NamingCheckFailure &Failure = NamingCheckFailures[Decl]; + if (!Failure.RawUsageLocs.insert(FixLocation.getRawEncoding()).second) + return; + if (!Failure.ShouldFix()) return; @@ -165,8 +168,6 @@ void RenamerClangTidyCheck::addUsage( if (!utils::rangeCanBeFixed(Range, SourceMgr)) Failure.FixStatus = RenamerClangTidyCheck::ShouldFixStatus::InsideMacro; - - Failure.RawUsageLocs.insert(FixLocation.getRawEncoding()); } void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, SourceRange Range, diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp index 03af5dd1565f8..2c116b210d050 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp @@ -32,8 +32,8 @@ TransformerClangTidyCheck::TransformerClangTidyCheck( MakeRule, StringRef Name, ClangTidyContext *Context) : 
ClangTidyCheck(Name, Context), Rule(MakeRule(getLangOpts(), Options)), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - IncludeSorter::IS_LLVM)) { + Inserter( + Options.getLocalOrGlobal("IncludeStyle", IncludeSorter::IS_LLVM)) { if (Rule) assert(llvm::all_of(Rule->Cases, hasExplanation) && "clang-tidy checks must have an explanation by default;" " explicitly provide an empty explanation if none is desired"); @@ -44,8 +44,8 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(RewriteRule R, StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), Rule(std::move(R)), - IncludeStyle(Options.getLocalOrGlobal("IncludeStyle", - IncludeSorter::IS_LLVM)) { + Inserter( + Options.getLocalOrGlobal("IncludeStyle", IncludeSorter::IS_LLVM)) { assert(llvm::all_of(Rule->Cases, hasExplanation) && "clang-tidy checks must have an explanation by default;" " explicitly provide an empty explanation if none is desired"); @@ -53,15 +53,12 @@ TransformerClangTidyCheck::TransformerClangTidyCheck(RewriteRule R, void TransformerClangTidyCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - // Only allocate and register the IncludeInsert when some `Case` will add + // Only register the IncludeInserter when some `Case` will add // includes. if (Rule && llvm::any_of(Rule->Cases, [](const RewriteRule::Case &C) { return !C.AddedIncludes.empty(); - })) { - Inserter = - std::make_unique<IncludeInserter>(SM, getLangOpts(), IncludeStyle); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); - } + })) + Inserter.registerPreprocessor(PP); } void TransformerClangTidyCheck::registerMatchers( @@ -102,15 +99,15 @@ void TransformerClangTidyCheck::check( Diag << FixItHint::CreateReplacement(T.Range, T.Replacement); for (const auto &I : Case.AddedIncludes) { - Diag << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), I.first, + Diag << Inserter.createMainFileIncludeInsertion( + I.first, /*IsAngled=*/I.second == transformer::IncludeFormat::Angled); } } void TransformerClangTidyCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { - Options.store(Opts, "IncludeStyle", IncludeStyle); + Options.store(Opts, "IncludeStyle", Inserter.getStyle()); } } // namespace utils diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h index 829a22fe8e2cc..404f474a24cae 100644 --- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h +++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.h @@ -70,8 +70,7 @@ class TransformerClangTidyCheck : public ClangTidyCheck { private: Optional<RewriteRule> Rule; - const IncludeSorter::IncludeStyle IncludeStyle; - std::unique_ptr<IncludeInserter> Inserter; + IncludeInserter Inserter; }; } // namespace utils diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index 8db6656e5291a..639441e8130ab 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -15,6 +15,7 @@ if (NOT DEFINED CLANGD_BUILD_XPC) endif () llvm_canonicalize_cmake_booleans(CLANGD_BUILD_XPC) +llvm_canonicalize_cmake_booleans(CLANGD_ENABLE_REMOTE) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Features.inc.in diff --git a/clang-tools-extra/clangd/Features.inc.in b/clang-tools-extra/clangd/Features.inc.in index da75aa67a65b8..6797232ddac7c 100644 --- a/clang-tools-extra/clangd/Features.inc.in +++ b/clang-tools-extra/clangd/Features.inc.in @@ -1 +1,2 @@ #define CLANGD_BUILD_XPC @CLANGD_BUILD_XPC@ +#define CLANGD_ENABLE_REMOTE @CLANGD_ENABLE_REMOTE@ diff --git
a/clang-tools-extra/clangd/FindSymbols.cpp b/clang-tools-extra/clangd/FindSymbols.cpp index f5d6a95aa713d..2471656988250 100644 --- a/clang-tools-extra/clangd/FindSymbols.cpp +++ b/clang-tools-extra/clangd/FindSymbols.cpp @@ -188,7 +188,7 @@ class DocumentOutline { } private: - enum class VisitKind { No, OnlyDecl, DeclAndChildren }; + enum class VisitKind { No, OnlyDecl, OnlyChildren, DeclAndChildren }; void traverseDecl(Decl *D, std::vector &Results) { if (auto *Templ = llvm::dyn_cast(D)) { @@ -196,18 +196,25 @@ class DocumentOutline { if (auto *TD = Templ->getTemplatedDecl()) D = TD; } - auto *ND = llvm::dyn_cast(D); - if (!ND) - return; - VisitKind Visit = shouldVisit(ND); + + VisitKind Visit = shouldVisit(D); if (Visit == VisitKind::No) return; - llvm::Optional Sym = declToSym(AST.getASTContext(), *ND); + + if (Visit == VisitKind::OnlyChildren) + return traverseChildren(D, Results); + + auto *ND = llvm::cast(D); + auto Sym = declToSym(AST.getASTContext(), *ND); if (!Sym) return; - if (Visit == VisitKind::DeclAndChildren) - traverseChildren(D, Sym->children); Results.push_back(std::move(*Sym)); + + if (Visit == VisitKind::OnlyDecl) + return; + + assert(Visit == VisitKind::DeclAndChildren && "Unexpected VisitKind"); + traverseChildren(ND, Results.back().children); } void traverseChildren(Decl *D, std::vector &Results) { @@ -218,10 +225,16 @@ class DocumentOutline { traverseDecl(C, Results); } - VisitKind shouldVisit(NamedDecl *D) { + VisitKind shouldVisit(Decl *D) { if (D->isImplicit()) return VisitKind::No; + if (llvm::isa(D) || llvm::isa(D)) + return VisitKind::OnlyChildren; + + if (!llvm::isa(D)) + return VisitKind::No; + if (auto Func = llvm::dyn_cast(D)) { // Some functions are implicit template instantiations, those should be // ignored. diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index c6022b2463e85..e4d2dddb4b5d3 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -163,13 +163,12 @@ const Type *getPointeeType(const Type *T) { } // Forward declaration, needed as this function is mutually recursive -// with resolveDependentExprToDecls. -const Type *resolveDependentExprToType(const Expr *E); +// with resolveExprToDecls. +const Type *resolveExprToType(const Expr *E); -// Try to heuristically resolve a dependent expression `E` to one +// Try to heuristically resolve a possibly-dependent expression `E` to one // or more declarations that it likely references. -std::vector resolveDependentExprToDecls(const Expr *E) { - assert(E->isTypeDependent()); +std::vector resolveExprToDecls(const Expr *E) { if (const auto *ME = dyn_cast(E)) { const Type *BaseType = ME->getBaseType().getTypePtrOrNull(); if (ME->isArrow()) { @@ -183,7 +182,7 @@ std::vector resolveDependentExprToDecls(const Expr *E) { // can get further by analyzing the depedent expression. Expr *Base = ME->isImplicitAccess() ? 
nullptr : ME->getBase(); if (Base && BT->getKind() == BuiltinType::Dependent) { - BaseType = resolveDependentExprToType(Base); + BaseType = resolveExprToType(Base); } } return getMembersReferencedViaDependentName( @@ -197,7 +196,7 @@ std::vector resolveDependentExprToDecls(const Expr *E) { /*IsNonstaticMember=*/false); } if (const auto *CE = dyn_cast(E)) { - const auto *CalleeType = resolveDependentExprToType(CE->getCallee()); + const auto *CalleeType = resolveExprToType(CE->getCallee()); if (!CalleeType) return {}; if (const auto *FnTypePtr = CalleeType->getAs()) @@ -209,15 +208,16 @@ std::vector resolveDependentExprToDecls(const Expr *E) { } } } - if (const auto *ME = dyn_cast(E)) { + if (const auto *ME = dyn_cast(E)) return {ME->getMemberDecl()}; - } + if (const auto *DRE = dyn_cast(E)) + return {DRE->getFoundDecl()}; return {}; } -// Try to heuristically resolve the type of a dependent expression `E`. -const Type *resolveDependentExprToType(const Expr *E) { - std::vector Decls = resolveDependentExprToDecls(E); +// Try to heuristically resolve the type of a possibly-dependent expression `E`. +const Type *resolveExprToType(const Expr *E) { + std::vector Decls = resolveExprToDecls(E); if (Decls.size() != 1) // Names an overload set -- just bail. return nullptr; if (const auto *TD = dyn_cast(Decls[0])) { @@ -426,12 +426,12 @@ struct TargetFinder { } void VisitCXXDependentScopeMemberExpr(const CXXDependentScopeMemberExpr *E) { - for (const NamedDecl *D : resolveDependentExprToDecls(E)) { + for (const NamedDecl *D : resolveExprToDecls(E)) { Outer.add(D, Flags); } } void VisitDependentScopeDeclRefExpr(const DependentScopeDeclRefExpr *E) { - for (const NamedDecl *D : resolveDependentExprToDecls(E)) { + for (const NamedDecl *D : resolveExprToDecls(E)) { Outer.add(D, Flags); } } @@ -460,6 +460,12 @@ struct TargetFinder { void VisitPseudoObjectExpr(const PseudoObjectExpr *POE) { Outer.add(POE->getSyntacticForm(), Flags); } + void VisitCXXNewExpr(const CXXNewExpr *CNE) { + Outer.add(CNE->getOperatorNew(), Flags); + } + void VisitCXXDeleteExpr(const CXXDeleteExpr *CDE) { + Outer.add(CDE->getOperatorDelete(), Flags); + } }; Visitor(*this, Flags).Visit(S); } diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index aeff7ebc32a26..26653aa409d7d 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -238,6 +238,10 @@ locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier, llvm::DenseMap ResultIndex; auto AddResultDecl = [&](const NamedDecl *D) { + // FIXME: Canonical declarations of some symbols might refer to built-in + // decls with possibly-invalid source locations (e.g. global new operator). + // In such cases we should pick up a redecl with valid source location + // instead of failing. 
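The two visitors added to FindTarget above make go-to-definition on the new and delete keywords resolve to the operator declarations; the FindTargetTests hunk later in this patch exercises exactly this. Illustratively:

    struct X {
      static void *operator new(unsigned long);
      static void operator delete(void *) noexcept;
    };
    void f() {
      X *P = new X; // the CXXNewExpr targets X::operator new
      delete P;     // the CXXDeleteExpr targets X::operator delete
    }

The FIXME in the XRefs hunk here is the flip side: for the implicit global operator new there may be no written redeclaration, so the canonical declaration can carry an invalid source location.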
D = llvm::cast(D->getCanonicalDecl()); auto Loc = makeLocation(AST.getASTContext(), nameLocation(*D, SM), MainFilePath); @@ -405,15 +409,17 @@ locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST, log("locateSymbolNamedTextuallyAt: {0}", MaybeDeclLoc.takeError()); return; } - Location DeclLoc = *MaybeDeclLoc; - Location DefLoc; + LocatedSymbol Located; + Located.PreferredDeclaration = *MaybeDeclLoc; + Located.Name = (Sym.Name + Sym.TemplateSpecializationArgs).str(); if (Sym.Definition) { auto MaybeDefLoc = indexToLSPLocation(Sym.Definition, MainFilePath); if (!MaybeDefLoc) { log("locateSymbolNamedTextuallyAt: {0}", MaybeDefLoc.takeError()); return; } - DefLoc = *MaybeDefLoc; + Located.PreferredDeclaration = *MaybeDefLoc; + Located.Definition = *MaybeDefLoc; } if (ScoredResults.size() >= 3) { @@ -424,11 +430,6 @@ locateSymbolTextually(const SpelledWord &Word, ParsedAST &AST, return; } - LocatedSymbol Located; - Located.Name = (Sym.Name + Sym.TemplateSpecializationArgs).str(); - Located.PreferredDeclaration = bool(Sym.Definition) ? DefLoc : DeclLoc; - Located.Definition = DefLoc; - SymbolQualitySignals Quality; Quality.merge(Sym); SymbolRelevanceSignals Relevance; @@ -518,7 +519,7 @@ const syntax::Token *findNearbyIdentifier(const SpelledWord &Word, // Find where the word occurred in the token stream, to search forward & back. auto *I = llvm::partition_point(SpelledTokens, [&](const syntax::Token &T) { assert(SM.getFileID(T.location()) == SM.getFileID(Word.Location)); - return T.location() >= Word.Location; // Comparison OK: same file. + return T.location() < Word.Location; // Comparison OK: same file. }); // Search for matches after the cursor. for (const syntax::Token &Tok : llvm::makeArrayRef(I, SpelledTokens.end())) diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 6c11399c87b68..c163951aff9ba 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -314,7 +314,8 @@ bool SymbolCollector::handleDeclOccurrence( // file locations for references (as it aligns the behavior of clangd's // AST-based xref). // FIXME: we should try to use the file locations for other fields. 
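The findNearbyIdentifier change above is a genuine bug fix: partition_point expects a predicate that is true for the leading partition (tokens before the cursor) and returns the first element where it turns false, i.e. the first token at or after the cursor; the old '>=' predicate described the trailing partition instead. The contract in miniature (std::partition_point behaves like llvm::partition_point):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Locs = {10, 20, 30, 40}; // sorted token locations
      int Cursor = 30;
      auto I = std::partition_point(Locs.begin(), Locs.end(),
                                    [&](int L) { return L < Cursor; });
      assert(*I == 30); // first token not before the cursor
    }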
- if (CollectRef && !IsMainFileOnly && !isa(ND) && + if (CollectRef && (!IsMainFileOnly || ND->isExternallyVisible()) && + !isa(ND) && (Opts.RefsInHeaders || SM.getFileID(SM.getFileLoc(Loc)) == SM.getMainFileID())) DeclRefs[ND].emplace_back(SM.getFileLoc(Loc), Roles); diff --git a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp index 80d87aa3f9f51..ca35f620bba18 100644 --- a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp +++ b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp @@ -283,7 +283,7 @@ class Export : public Command { }; public: - void run() { + void run() override { using namespace clang::clangd; // Read input file (as specified in global option) auto Buffer = llvm::MemoryBuffer::getFile(IndexLocation); diff --git a/clang-tools-extra/clangd/index/remote/Client.cpp b/clang-tools-extra/clangd/index/remote/Client.cpp index 35ce84068f406..4c1741e715a55 100644 --- a/clang-tools-extra/clangd/index/remote/Client.cpp +++ b/clang-tools-extra/clangd/index/remote/Client.cpp @@ -16,6 +16,7 @@ #include "support/Logger.h" #include "support/Trace.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include @@ -37,12 +38,15 @@ class IndexClient : public clangd::SymbolIndex { bool FinalResult = false; trace::Span Tracer(RequestT::descriptor()->name()); const auto RPCRequest = ProtobufMarshaller->toProtobuf(Request); + SPAN_ATTACH(Tracer, "Request", RPCRequest.DebugString()); grpc::ClientContext Context; std::chrono::system_clock::time_point Deadline = std::chrono::system_clock::now() + DeadlineWaitingTime; Context.set_deadline(Deadline); auto Reader = (Stub.get()->*RPCCall)(&Context, RPCRequest); ReplyT Reply; + unsigned Successful = 0; + unsigned FailedToParse = 0; while (Reader->Read(&Reply)) { if (!Reply.has_stream_result()) { FinalResult = Reply.final_result(); @@ -50,12 +54,18 @@ class IndexClient : public clangd::SymbolIndex { } auto Response = ProtobufMarshaller->fromProtobuf(Reply.stream_result()); if (!Response) { - elog("Received invalid {0}", ReplyT::descriptor()->name()); + elog("Received invalid {0}: {1}. Reason: {2}", + ReplyT::descriptor()->name(), Reply.stream_result().DebugString(), + Response.takeError()); + ++FailedToParse; continue; } Callback(*Response); + ++Successful; } - SPAN_ATTACH(Tracer, "status", Reader->Finish().ok()); + SPAN_ATTACH(Tracer, "Status", Reader->Finish().ok()); + SPAN_ATTACH(Tracer, "Successful", Successful); + SPAN_ATTACH(Tracer, "Failed to parse", FailedToParse); return FinalResult; } @@ -84,11 +94,16 @@ class IndexClient : public clangd::SymbolIndex { return streamRPC(Request, &remote::SymbolIndex::Stub::Refs, Callback); } - // FIXME(kirillbobyrev): Implement this. void - relations(const clangd::RelationsRequest &, - llvm::function_ref) - const {} + relations(const clangd::RelationsRequest &Request, + llvm::function_ref + Callback) const { + streamRPC(Request, &remote::SymbolIndex::Stub::Relations, + // Unpack protobuf Relation. + [&](std::pair SubjectAndObject) { + Callback(SubjectAndObject.first, SubjectAndObject.second); + }); + } // IndexClient does not take any space since the data is stored on the // server. 
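With relations() implemented, the remote IndexClient is interchangeable with local indexes behind the clangd::SymbolIndex interface. A hypothetical caller, with names following index/Index.h (RelationKind::BaseOf assumed as the predicate):

    // Sketch: enumerate symbols related to Base via BaseOf, against any
    // SymbolIndex implementation, including the new remote IndexClient.
    void listSubclasses(const clang::clangd::SymbolIndex &Index,
                        clang::clangd::SymbolID Base) {
      clang::clangd::RelationsRequest Req;
      Req.Subjects.insert(Base);
      Req.Predicate = clang::clangd::RelationKind::BaseOf;
      Index.relations(Req, [](const clang::clangd::SymbolID &Subject,
                              const clang::clangd::Symbol &Object) {
        llvm::outs() << Object.Name << "\n";
      });
    }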
diff --git a/clang-tools-extra/clangd/index/remote/Index.proto b/clang-tools-extra/clangd/index/remote/Index.proto index 99c4e3329d670..305164ffef779 100644 --- a/clang-tools-extra/clangd/index/remote/Index.proto +++ b/clang-tools-extra/clangd/index/remote/Index.proto @@ -18,6 +18,8 @@ service SymbolIndex { rpc FuzzyFind(FuzzyFindRequest) returns (stream FuzzyFindReply) {} rpc Refs(RefsRequest) returns (stream RefsReply) {} + + rpc Relations(RelationsRequest) returns (stream RelationsReply) {} } message LookupRequest { repeated string ids = 1; } @@ -114,3 +116,25 @@ message HeaderWithReferences { string header = 1; uint32 references = 2; } + +message RelationsRequest { + repeated string subjects = 1; + uint32 predicate = 2; + uint32 limit = 3; +} + +// The response is a stream of relation messages, and one terminating has_more +// message. +message RelationsReply { + oneof kind { + Relation stream_result = 1; + bool final_result = 2; // HasMore + } +} + +// This message does not mirror clangd::Relation but rather the arguments of +// the SymbolIndex::relations callback. +message Relation { + string subject_id = 1; + Symbol object = 2; +} diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp index b6c83c9740727..bac5b7e6e958e 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp @@ -10,6 +10,7 @@ #include "Headers.h" #include "Index.pb.h" #include "Protocol.h" +#include "index/Index.h" #include "index/Serialization.h" #include "index/Symbol.h" #include "index/SymbolID.h" @@ -17,8 +18,7 @@ #include "index/SymbolOrigin.h" #include "support/Logger.h" #include "clang/Index/IndexSymbol.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/Optional.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -30,6 +30,27 @@ namespace clang { namespace clangd { namespace remote { +namespace { + +template <typename IDRange> +llvm::Expected<llvm::DenseSet<SymbolID>> getIDs(IDRange IDs) { + llvm::DenseSet<SymbolID> Result; + for (const auto &ID : IDs) { + auto SID = SymbolID::fromStr(StringRef(ID)); + if (!SID) + return SID.takeError(); + Result.insert(*SID); + } + return Result; +} + +llvm::Error makeStringError(llvm::StringRef Message) { + return llvm::make_error<llvm::StringError>(Message, + llvm::inconvertibleErrorCode()); +} + +} // namespace + Marshaller::Marshaller(llvm::StringRef RemoteIndexRoot, llvm::StringRef LocalIndexRoot) : Strings(Arena) { @@ -49,41 +70,70 @@ Marshaller::Marshaller(llvm::StringRef RemoteIndexRoot, assert(!RemoteIndexRoot.empty() || !LocalIndexRoot.empty()); } -clangd::FuzzyFindRequest -Marshaller::fromProtobuf(const FuzzyFindRequest *Request) { +llvm::Expected<clangd::LookupRequest> +Marshaller::fromProtobuf(const LookupRequest *Message) { + clangd::LookupRequest Req; + auto IDs = getIDs(Message->ids()); + if (!IDs) + return IDs.takeError(); + Req.IDs = std::move(*IDs); + return Req; +} + +llvm::Expected<clangd::FuzzyFindRequest> +Marshaller::fromProtobuf(const FuzzyFindRequest *Message) { assert(RemoteIndexRoot); clangd::FuzzyFindRequest Result; - Result.Query = Request->query(); - for (const auto &Scope : Request->scopes()) + Result.Query = Message->query(); + for (const auto &Scope : Message->scopes()) Result.Scopes.push_back(Scope); - Result.AnyScope = Request->any_scope(); - if (Request->limit()) - Result.Limit = Request->limit(); - Result.RestrictForCodeCompletion = Request->restricted_for_code_completion(); - for (const auto
&Path : Request->proximity_paths()) { + Result.AnyScope = Message->any_scope(); + if (Message->limit()) + Result.Limit = Message->limit(); + Result.RestrictForCodeCompletion = Message->restricted_for_code_completion(); + for (const auto &Path : Message->proximity_paths()) { llvm::SmallString<256> LocalPath = llvm::StringRef(*RemoteIndexRoot); llvm::sys::path::append(LocalPath, Path); Result.ProximityPaths.push_back(std::string(LocalPath)); } - for (const auto &Type : Request->preferred_types()) + for (const auto &Type : Message->preferred_types()) Result.ProximityPaths.push_back(Type); return Result; } -llvm::Optional Marshaller::fromProtobuf(const Symbol &Message) { - if (!Message.has_info() || !Message.has_canonical_declaration()) { - elog("Cannot convert Symbol from protobuf (missing info, definition or " - "declaration): {0}", - Message.DebugString()); - return llvm::None; - } +llvm::Expected +Marshaller::fromProtobuf(const RefsRequest *Message) { + clangd::RefsRequest Req; + auto IDs = getIDs(Message->ids()); + if (!IDs) + return IDs.takeError(); + Req.IDs = std::move(*IDs); + Req.Filter = static_cast(Message->filter()); + if (Message->limit()) + Req.Limit = Message->limit(); + return Req; +} + +llvm::Expected +Marshaller::fromProtobuf(const RelationsRequest *Message) { + clangd::RelationsRequest Req; + auto IDs = getIDs(Message->subjects()); + if (!IDs) + return IDs.takeError(); + Req.Subjects = std::move(*IDs); + Req.Predicate = static_cast(Message->predicate()); + if (Message->limit()) + Req.Limit = Message->limit(); + return Req; +} + +llvm::Expected Marshaller::fromProtobuf(const Symbol &Message) { + if (!Message.has_info() || !Message.has_canonical_declaration()) + return makeStringError("Missing info or declaration."); clangd::Symbol Result; auto ID = SymbolID::fromStr(Message.id()); - if (!ID) { - elog("Cannot parse SymbolID {0} given protobuf: {1}", ID.takeError(), - Message.DebugString()); - return llvm::None; - } + if (!ID) + return ID.takeError(); Result.ID = *ID; Result.SymInfo = fromProtobuf(Message.info()); Result.Name = Message.name(); @@ -94,11 +144,8 @@ llvm::Optional Marshaller::fromProtobuf(const Symbol &Message) { Result.Definition = *Definition; } auto Declaration = fromProtobuf(Message.canonical_declaration()); - if (!Declaration) { - elog("Cannot convert Symbol from protobuf (invalid declaration): {0}", - Message.DebugString()); - return llvm::None; - } + if (!Declaration) + return Declaration.takeError(); Result.CanonicalDeclaration = *Declaration; Result.References = Message.references(); Result.Origin = static_cast(Message.origin()); @@ -110,34 +157,39 @@ llvm::Optional Marshaller::fromProtobuf(const Symbol &Message) { Result.Type = Message.type(); for (const auto &Header : Message.headers()) { auto SerializedHeader = fromProtobuf(Header); - if (SerializedHeader) - Result.IncludeHeaders.push_back(*SerializedHeader); - else - elog("Cannot convert HeaderWithIncludes from protobuf: {0}", - Header.DebugString()); + if (!SerializedHeader) + return SerializedHeader.takeError(); + Result.IncludeHeaders.push_back(*SerializedHeader); } Result.Flags = static_cast(Message.flags()); return Result; } -llvm::Optional Marshaller::fromProtobuf(const Ref &Message) { - if (!Message.has_location()) { - elog("Cannot convert Ref from protobuf (missing location): {0}", - Message.DebugString()); - return llvm::None; - } +llvm::Expected Marshaller::fromProtobuf(const Ref &Message) { + if (!Message.has_location()) + return makeStringError("Missing location."); clangd::Ref Result; 
auto Location = fromProtobuf(Message.location()); - if (!Location) { - elog("Cannot convert Ref from protobuf (invalid location): {0}", - Message.DebugString()); - return llvm::None; - } + if (!Location) + return Location.takeError(); Result.Location = *Location; Result.Kind = static_cast(Message.kind()); return Result; } +llvm::Expected> +Marshaller::fromProtobuf(const Relation &Message) { + auto SubjectID = SymbolID::fromStr(Message.subject_id()); + if (!SubjectID) + return SubjectID.takeError(); + if (!Message.has_object()) + return makeStringError("Missing Object."); + auto Object = fromProtobuf(Message.object()); + if (!Object) + return Object.takeError(); + return std::make_pair(*SubjectID, *Object); +} + LookupRequest Marshaller::toProtobuf(const clangd::LookupRequest &From) { LookupRequest RPCRequest; for (const auto &SymbolID : From.IDs) @@ -157,8 +209,7 @@ FuzzyFindRequest Marshaller::toProtobuf(const clangd::FuzzyFindRequest &From) { RPCRequest.set_restricted_for_code_completion(From.RestrictForCodeCompletion); for (const auto &Path : From.ProximityPaths) { llvm::SmallString<256> RelativePath = llvm::StringRef(Path); - if (llvm::sys::path::replace_path_prefix(RelativePath, *LocalIndexRoot, - "")) + if (llvm::sys::path::replace_path_prefix(RelativePath, *LocalIndexRoot, "")) RPCRequest.add_proximity_paths(llvm::sys::path::convert_to_slash( RelativePath, llvm::sys::path::Style::posix)); } @@ -177,27 +228,31 @@ RefsRequest Marshaller::toProtobuf(const clangd::RefsRequest &From) { return RPCRequest; } -llvm::Optional Marshaller::toProtobuf(const clangd::Symbol &From) { +RelationsRequest Marshaller::toProtobuf(const clangd::RelationsRequest &From) { + RelationsRequest RPCRequest; + for (const auto &ID : From.Subjects) + RPCRequest.add_subjects(ID.str()); + RPCRequest.set_predicate(static_cast(From.Predicate)); + if (From.Limit) + RPCRequest.set_limit(*From.Limit); + return RPCRequest; +} + +llvm::Expected Marshaller::toProtobuf(const clangd::Symbol &From) { Symbol Result; Result.set_id(From.ID.str()); *Result.mutable_info() = toProtobuf(From.SymInfo); Result.set_name(From.Name.str()); if (*From.Definition.FileURI) { auto Definition = toProtobuf(From.Definition); - if (!Definition) { - elog("Can not convert Symbol to protobuf (invalid definition) {0}: {1}", - From, From.Definition); - return llvm::None; - } + if (!Definition) + return Definition.takeError(); *Result.mutable_definition() = *Definition; } Result.set_scope(From.Scope.str()); auto Declaration = toProtobuf(From.CanonicalDeclaration); - if (!Declaration) { - elog("Can not convert Symbol to protobuf (invalid declaration) {0}: {1}", - From, From.CanonicalDeclaration); - return llvm::None; - } + if (!Declaration) + return Declaration.takeError(); *Result.mutable_canonical_declaration() = *Declaration; Result.set_references(From.References); Result.set_origin(static_cast(From.Origin)); @@ -210,11 +265,8 @@ llvm::Optional Marshaller::toProtobuf(const clangd::Symbol &From) { Result.set_type(From.Type.str()); for (const auto &Header : From.IncludeHeaders) { auto Serialized = toProtobuf(Header); - if (!Serialized) { - elog("Can not convert IncludeHeaderWithReferences to protobuf: {0}", - Header.IncludeHeader); - continue; - } + if (!Serialized) + return Serialized.takeError(); auto *NextHeader = Result.add_headers(); *NextHeader = *Serialized; } @@ -222,51 +274,52 @@ llvm::Optional Marshaller::toProtobuf(const clangd::Symbol &From) { return Result; } -llvm::Optional Marshaller::toProtobuf(const clangd::Ref &From) { +llvm::Expected 
Marshaller::toProtobuf(const clangd::Ref &From) { Ref Result; Result.set_kind(static_cast(From.Kind)); auto Location = toProtobuf(From.Location); - if (!Location) { - elog("Can not convert Reference to protobuf (invalid location) {0}: {1}", - From, From.Location); - return llvm::None; - } + if (!Location) + return Location.takeError(); *Result.mutable_location() = *Location; return Result; } -llvm::Optional +llvm::Expected Marshaller::toProtobuf(const clangd::SymbolID &Subject, + const clangd::Symbol &Object) { + Relation Result; + *Result.mutable_subject_id() = Subject.str(); + auto SerializedObject = toProtobuf(Object); + if (!SerializedObject) + return SerializedObject.takeError(); + *Result.mutable_object() = *SerializedObject; + return Result; +} + +llvm::Expected Marshaller::relativePathToURI(llvm::StringRef RelativePath) { assert(LocalIndexRoot); assert(RelativePath == llvm::sys::path::convert_to_slash( RelativePath, llvm::sys::path::Style::posix)); - if (RelativePath.empty()) { - return llvm::None; - } - if (llvm::sys::path::is_absolute(RelativePath)) { - return llvm::None; - } + if (RelativePath.empty()) + return makeStringError("Empty relative path."); + if (llvm::sys::path::is_absolute(RelativePath)) + return makeStringError("RelativePath is absolute."); llvm::SmallString<256> FullPath = llvm::StringRef(*LocalIndexRoot); llvm::sys::path::append(FullPath, RelativePath); auto Result = URI::createFile(FullPath); return Result.toString(); } -llvm::Optional Marshaller::uriToRelativePath(llvm::StringRef URI) { +llvm::Expected Marshaller::uriToRelativePath(llvm::StringRef URI) { assert(RemoteIndexRoot); auto ParsedURI = URI::parse(URI); - if (!ParsedURI) { - elog("Remote index got bad URI from client {0}: {1}", URI, - ParsedURI.takeError()); - return llvm::None; - } - if (ParsedURI->scheme() != "file") { - return llvm::None; - } + if (!ParsedURI) + return ParsedURI.takeError(); + if (ParsedURI->scheme() != "file") + return makeStringError("Can not use URI schemes other than file."); llvm::SmallString<256> Result = ParsedURI->body(); - if (!llvm::sys::path::replace_path_prefix(Result, *RemoteIndexRoot, "")) { - return llvm::None; - } + if (!llvm::sys::path::replace_path_prefix(Result, *RemoteIndexRoot, "")) + return makeStringError("File path doesn't start with RemoteIndexRoot."); // Make sure the result has UNIX slashes. 
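relativePathToURI and uriToRelativePath are mirror images around replace_path_prefix: the server strips its RemoteIndexRoot before sending a path, and the client re-anchors it under LocalIndexRoot (and vice versa for requests). A standalone sketch of the round-trip, with /remote and /local as assumed roots; the conversion just below then normalizes to POSIX slashes:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"

    void roundTrip() {
      llvm::SmallString<256> Path("/remote/clangd/XRefs.cpp");
      // Server side: strip the remote root, leaving "clangd/XRefs.cpp".
      llvm::sys::path::replace_path_prefix(Path, "/remote/", "");
      // Client side: re-anchor the relative path under the local root.
      llvm::SmallString<256> Local("/local");
      llvm::sys::path::append(Local, Path);
      // Local is now "/local/clangd/XRefs.cpp".
    }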
return llvm::sys::path::convert_to_slash(Result, llvm::sys::path::Style::posix); @@ -307,31 +360,31 @@ SymbolInfo Marshaller::toProtobuf(const clang::index::SymbolInfo &Info) { return Result; } -llvm::Optional +llvm::Expected Marshaller::fromProtobuf(const SymbolLocation &Message) { clangd::SymbolLocation Location; auto URIString = relativePathToURI(Message.file_path()); if (!URIString) - return llvm::None; + return URIString.takeError(); Location.FileURI = Strings.save(*URIString).begin(); Location.Start = fromProtobuf(Message.start()); Location.End = fromProtobuf(Message.end()); return Location; } -llvm::Optional +llvm::Expected Marshaller::toProtobuf(const clangd::SymbolLocation &Location) { remote::SymbolLocation Result; auto RelativePath = uriToRelativePath(Location.FileURI); if (!RelativePath) - return llvm::None; + return RelativePath.takeError(); *Result.mutable_file_path() = *RelativePath; *Result.mutable_start() = toProtobuf(Location.Start); *Result.mutable_end() = toProtobuf(Location.End); return Result; } -llvm::Optional Marshaller::toProtobuf( +llvm::Expected Marshaller::toProtobuf( const clangd::Symbol::IncludeHeaderWithReferences &IncludeHeader) { HeaderWithReferences Result; Result.set_references(IncludeHeader.References); @@ -342,18 +395,18 @@ llvm::Optional Marshaller::toProtobuf( } auto RelativePath = uriToRelativePath(Header); if (!RelativePath) - return llvm::None; + return RelativePath.takeError(); Result.set_header(*RelativePath); return Result; } -llvm::Optional +llvm::Expected Marshaller::fromProtobuf(const HeaderWithReferences &Message) { std::string Header = Message.header(); - if (Header.front() != '<' && Header.front() != '"') { + if (!isLiteralInclude(Header)) { auto URIString = relativePathToURI(Header); if (!URIString) - return llvm::None; + return URIString.takeError(); Header = *URIString; } return clangd::Symbol::IncludeHeaderWithReferences{Strings.save(Header), diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h index 9129cff24db57..18ce6074264c0 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h @@ -38,9 +38,18 @@ class Marshaller { Marshaller() = delete; Marshaller(llvm::StringRef RemoteIndexRoot, llvm::StringRef LocalIndexRoot); - clangd::FuzzyFindRequest fromProtobuf(const FuzzyFindRequest *Request); - llvm::Optional fromProtobuf(const Symbol &Message); - llvm::Optional fromProtobuf(const Ref &Message); + llvm::Expected fromProtobuf(const Symbol &Message); + llvm::Expected fromProtobuf(const Ref &Message); + llvm::Expected> + fromProtobuf(const Relation &Message); + + llvm::Expected + fromProtobuf(const LookupRequest *Message); + llvm::Expected + fromProtobuf(const FuzzyFindRequest *Message); + llvm::Expected fromProtobuf(const RefsRequest *Message); + llvm::Expected + fromProtobuf(const RelationsRequest *Message); /// toProtobuf() functions serialize native clangd types and strip IndexRoot /// from the file paths specific to indexing machine. 
fromProtobuf() functions @@ -49,9 +58,12 @@ class Marshaller { LookupRequest toProtobuf(const clangd::LookupRequest &From); FuzzyFindRequest toProtobuf(const clangd::FuzzyFindRequest &From); RefsRequest toProtobuf(const clangd::RefsRequest &From); + RelationsRequest toProtobuf(const clangd::RelationsRequest &From); - llvm::Optional toProtobuf(const clangd::Ref &From); - llvm::Optional toProtobuf(const clangd::Symbol &From); + llvm::Expected toProtobuf(const clangd::Symbol &From); + llvm::Expected toProtobuf(const clangd::Ref &From); + llvm::Expected toProtobuf(const clangd::SymbolID &Subject, + const clangd::Symbol &Object); /// Translates \p RelativePath into the absolute path and builds URI for the /// user machine. This translation happens on the client side with the @@ -59,23 +71,23 @@ class Marshaller { /// provided by the client. /// /// The relative path passed over the wire has unix slashes. - llvm::Optional relativePathToURI(llvm::StringRef RelativePath); + llvm::Expected relativePathToURI(llvm::StringRef RelativePath); /// Translates a URI from the server's backing index to a relative path /// suitable to send over the wire to the client. - llvm::Optional uriToRelativePath(llvm::StringRef URI); + llvm::Expected uriToRelativePath(llvm::StringRef URI); private: clangd::SymbolLocation::Position fromProtobuf(const Position &Message); Position toProtobuf(const clangd::SymbolLocation::Position &Position); clang::index::SymbolInfo fromProtobuf(const SymbolInfo &Message); SymbolInfo toProtobuf(const clang::index::SymbolInfo &Info); - llvm::Optional + llvm::Expected fromProtobuf(const SymbolLocation &Message); - llvm::Optional + llvm::Expected toProtobuf(const clangd::SymbolLocation &Location); - llvm::Optional + llvm::Expected toProtobuf(const clangd::Symbol::IncludeHeaderWithReferences &IncludeHeader); - llvm::Optional + llvm::Expected fromProtobuf(const HeaderWithReferences &Message); /// RemoteIndexRoot and LocalIndexRoot are absolute paths to the project (on diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index 07b1c736b6725..e9838cce85e3d 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -6,11 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "Index.pb.h" #include "index/Index.h" #include "index/Serialization.h" +#include "index/Symbol.h" #include "index/remote/marshalling/Marshalling.h" +#include "support/Logger.h" +#include "support/Trace.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" @@ -35,6 +40,25 @@ llvm::cl::opt IndexPath(llvm::cl::desc(""), llvm::cl::opt IndexRoot(llvm::cl::desc(""), llvm::cl::Positional, llvm::cl::Required); +llvm::cl::opt LogLevel{ + "log", + llvm::cl::desc("Verbosity of log messages written to stderr"), + values(clEnumValN(Logger::Error, "error", "Error messages only"), + clEnumValN(Logger::Info, "info", "High level execution tracing"), + clEnumValN(Logger::Debug, "verbose", "Low level details")), + llvm::cl::init(Logger::Info), +}; + +llvm::cl::opt TraceFile( + "trace-file", + llvm::cl::desc("Path to the file where tracer logs will be stored")); + +llvm::cl::opt PrettyPrint{ + "pretty", + llvm::cl::desc("Pretty-print JSON output in the trace"), + llvm::cl::init(false), +}; + llvm::cl::opt ServerAddress( "server-address", 
llvm::cl::init("0.0.0.0:50051"), llvm::cl::desc("Address of the invoked server. Defaults to 0.0.0.0:50051")); @@ -59,65 +83,130 @@ class RemoteIndexServer final : public SymbolIndex::Service { grpc::Status Lookup(grpc::ServerContext *Context, const LookupRequest *Request, grpc::ServerWriter *Reply) override { - clangd::LookupRequest Req; - for (const auto &ID : Request->ids()) { - auto SID = SymbolID::fromStr(StringRef(ID)); - if (!SID) - return grpc::Status::CANCELLED; - Req.IDs.insert(*SID); + trace::Span Tracer("LookupRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse LookupRequest from protobuf: {0}", Req.takeError()); + return grpc::Status::CANCELLED; } - Index->lookup(Req, [&](const clangd::Symbol &Sym) { - auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym); - if (!SerializedSymbol) + unsigned Sent = 0; + unsigned FailedToSend = 0; + Index->lookup(*Req, [&](const clangd::Symbol &Item) { + auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); + if (!SerializedItem) { + elog("Unable to convert Symbol to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; return; + } LookupReply NextMessage; - *NextMessage.mutable_stream_result() = *SerializedSymbol; + *NextMessage.mutable_stream_result() = *SerializedItem; Reply->Write(NextMessage); + ++Sent; }); LookupReply LastMessage; LastMessage.set_final_result(true); Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); return grpc::Status::OK; } grpc::Status FuzzyFind(grpc::ServerContext *Context, const FuzzyFindRequest *Request, grpc::ServerWriter *Reply) override { - const auto Req = ProtobufMarshaller->fromProtobuf(Request); - bool HasMore = Index->fuzzyFind(Req, [&](const clangd::Symbol &Sym) { - auto SerializedSymbol = ProtobufMarshaller->toProtobuf(Sym); - if (!SerializedSymbol) + trace::Span Tracer("FuzzyFindRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse FuzzyFindRequest from protobuf: {0}", + Req.takeError()); + return grpc::Status::CANCELLED; + } + unsigned Sent = 0; + unsigned FailedToSend = 0; + bool HasMore = Index->fuzzyFind(*Req, [&](const clangd::Symbol &Item) { + auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); + if (!SerializedItem) { + elog("Unable to convert Symbol to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; return; + } FuzzyFindReply NextMessage; - *NextMessage.mutable_stream_result() = *SerializedSymbol; + *NextMessage.mutable_stream_result() = *SerializedItem; Reply->Write(NextMessage); + ++Sent; }); FuzzyFindReply LastMessage; LastMessage.set_final_result(HasMore); Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); return grpc::Status::OK; } grpc::Status Refs(grpc::ServerContext *Context, const RefsRequest *Request, grpc::ServerWriter *Reply) override { - clangd::RefsRequest Req; - for (const auto &ID : Request->ids()) { - auto SID = SymbolID::fromStr(StringRef(ID)); - if (!SID) - return grpc::Status::CANCELLED; - Req.IDs.insert(*SID); + trace::Span Tracer("RefsRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse RefsRequest from protobuf: {0}", Req.takeError()); + return grpc::Status::CANCELLED; } - bool HasMore = Index->refs(Req, [&](const clangd::Ref &Reference) { - auto SerializedRef = ProtobufMarshaller->toProtobuf(Reference); - if (!SerializedRef) + unsigned Sent = 0; + 
unsigned FailedToSend = 0; + bool HasMore = Index->refs(*Req, [&](const clangd::Ref &Item) { + auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); + if (!SerializedItem) { + elog("Unable to convert Ref to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; return; + } RefsReply NextMessage; - *NextMessage.mutable_stream_result() = *SerializedRef; + *NextMessage.mutable_stream_result() = *SerializedItem; Reply->Write(NextMessage); + ++Sent; }); RefsReply LastMessage; LastMessage.set_final_result(HasMore); Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); + return grpc::Status::OK; + } + + grpc::Status Relations(grpc::ServerContext *Context, + const RelationsRequest *Request, + grpc::ServerWriter *Reply) override { + trace::Span Tracer("RelationsRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse RelationsRequest from protobuf: {0}", + Req.takeError()); + return grpc::Status::CANCELLED; + } + unsigned Sent = 0; + unsigned FailedToSend = 0; + Index->relations( + *Req, [&](const SymbolID &Subject, const clangd::Symbol &Object) { + auto SerializedItem = ProtobufMarshaller->toProtobuf(Subject, Object); + if (!SerializedItem) { + elog("Unable to convert Relation to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; + return; + } + RelationsReply NextMessage; + *NextMessage.mutable_stream_result() = *SerializedItem; + Reply->Write(NextMessage); + ++Sent; + }); + RelationsReply LastMessage; + LastMessage.set_final_result(true); + Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); return grpc::Status::OK; } @@ -134,7 +223,7 @@ void runServer(std::unique_ptr Index, Builder.AddListeningPort(ServerAddress, grpc::InsecureServerCredentials()); Builder.RegisterService(&Service); std::unique_ptr Server(Builder.BuildAndStart()); - llvm::outs() << "Server listening on " << ServerAddress << '\n'; + log("Server listening on {0}", ServerAddress); Server->Wait(); } @@ -144,6 +233,8 @@ void runServer(std::unique_ptr Index, } // namespace clangd } // namespace clang +using clang::clangd::elog; + int main(int argc, char *argv[]) { using namespace clang::clangd::remote; llvm::cl::ParseCommandLineOptions(argc, argv, Overview); @@ -154,6 +245,34 @@ int main(int argc, char *argv[]) { return -1; } + llvm::errs().SetBuffered(); + // Don't flush stdout when logging for thread safety. + llvm::errs().tie(nullptr); + clang::clangd::StreamLogger Logger(llvm::errs(), LogLevel); + clang::clangd::LoggingSession LoggingSession(Logger); + + llvm::Optional TracerStream; + std::unique_ptr Tracer; + if (!TraceFile.empty()) { + std::error_code EC; + TracerStream.emplace(TraceFile, EC, + llvm::sys::fs::FA_Read | llvm::sys::fs::FA_Write); + if (EC) { + TracerStream.reset(); + elog("Error while opening trace file {0}: {1}", TraceFile, EC.message()); + } else { + // FIXME(kirillbobyrev): Also create metrics tracer to track latency and + // accumulate other request statistics. 
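All four handlers (Lookup, FuzzyFind, Refs, and the new Relations) now share one shape: open a trace::Span named after the request, count converted and failed results while streaming, and attach both counters before returning. Reduced to a skeleton (hypothetical handler; Span and SPAN_ATTACH come from support/Trace.h):

    grpc::Status SomeRequestHandler(/* context, request, reply writer */) {
      clang::clangd::trace::Span Tracer("SomeRequest");
      unsigned Sent = 0, FailedToSend = 0;
      // ...convert and stream each result, bumping the matching counter...
      SPAN_ATTACH(Tracer, "Sent", Sent);
      SPAN_ATTACH(Tracer, "Failed to send", FailedToSend);
      return grpc::Status::OK;
    }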
+ Tracer = clang::clangd::trace::createJSONTracer(*TracerStream, + /*PrettyPrint=*/false); + clang::clangd::vlog("Successfully created a tracer."); + } + } + + llvm::Optional TracingSession; + if (Tracer) + TracingSession.emplace(*Tracer); + std::unique_ptr Index = openIndex(IndexPath); if (!Index) { diff --git a/clang-tools-extra/clangd/indexer/CMakeLists.txt b/clang-tools-extra/clangd/indexer/CMakeLists.txt index edbced1410bb8..ff110693c7066 100644 --- a/clang-tools-extra/clangd/indexer/CMakeLists.txt +++ b/clang-tools-extra/clangd/indexer/CMakeLists.txt @@ -20,4 +20,5 @@ clang_target_link_libraries(clangd-indexer target_link_libraries(clangd-indexer PRIVATE clangDaemon + clangdSupport ) diff --git a/clang-tools-extra/clangd/indexer/IndexerMain.cpp b/clang-tools-extra/clangd/indexer/IndexerMain.cpp index dac038308d9e9..46224238c3fca 100644 --- a/clang-tools-extra/clangd/indexer/IndexerMain.cpp +++ b/clang-tools-extra/clangd/indexer/IndexerMain.cpp @@ -16,6 +16,7 @@ #include "index/Serialization.h" #include "index/Symbol.h" #include "index/SymbolCollector.h" +#include "support/Logger.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CommonOptionsParser.h" #include "clang/Tooling/Execution.h" @@ -122,7 +123,7 @@ int main(int argc, const char **argv) { std::make_unique(Data), clang::tooling::getStripPluginsAdjuster()); if (Err) { - llvm::errs() << llvm::toString(std::move(Err)) << "\n"; + clang::clangd::elog("{0}", std::move(Err)); } // Emit collected data. diff --git a/clang-tools-extra/clangd/tool/CMakeLists.txt b/clang-tools-extra/clangd/tool/CMakeLists.txt index 3368013f50798..670e5a17013ab 100644 --- a/clang-tools-extra/clangd/tool/CMakeLists.txt +++ b/clang-tools-extra/clangd/tool/CMakeLists.txt @@ -27,6 +27,7 @@ clang_target_link_libraries(clangd clangToolingCore clangToolingRefactoring clangToolingSyntax + clangdRemoteIndex ) target_link_libraries(clangd PRIVATE diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 7bce1c062e817..daf87d11c3843 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -14,6 +14,7 @@ #include "Transport.h" #include "index/Background.h" #include "index/Serialization.h" +#include "index/remote/Client.h" #include "refactor/Rename.h" #include "support/Path.h" #include "support/Shutdown.h" @@ -449,6 +450,21 @@ opt EnableConfig{ init(true), }; +#if CLANGD_ENABLE_REMOTE +opt RemoteIndexAddress{ + "remote-index-address", + cat(Features), + desc("Address of the remote index server"), +}; + +// FIXME(kirillbobyrev): Should this be the location of compile_commands.json? +opt ProjectRoot{ + "project-root", + cat(Features), + desc("Path to the project root. Requires remote-index-address to be set."), +}; +#endif + /// Supports a test URI scheme with relaxed constraints for lit tests. /// The path in a test URI will be combined with a platform-specific fake /// directory to form an absolute path. For example, test:///a.cpp is resolved @@ -641,16 +657,16 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var // continuing. llvm::SmallString<128> Path(CompileCommandsDir); if (std::error_code EC = llvm::sys::fs::make_absolute(Path)) { - llvm::errs() << "Error while converting the relative path specified by " - "--compile-commands-dir to an absolute path: " - << EC.message() << ". 
The argument will be ignored.\n"; + elog("Error while converting the relative path specified by " + "--compile-commands-dir to an absolute path: {0}. The argument " + "will be ignored.", + EC.message()); } else { CompileCommandsDirPath = std::string(Path.str()); } } else { - llvm::errs() - << "Path specified by --compile-commands-dir does not exist. The " - "argument will be ignored.\n"; + elog("Path specified by --compile-commands-dir does not exist. The " + "argument will be ignored."); } } @@ -680,6 +696,23 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var if (Sync) AsyncIndexLoad.wait(); } +#if CLANGD_ENABLE_REMOTE + if (RemoteIndexAddress.empty() != ProjectRoot.empty()) { + llvm::errs() << "remote-index-address and project-root have to be " + "specified at the same time."; + return 1; + } + if (!RemoteIndexAddress.empty()) { + if (IndexFile.empty()) { + log("Connecting to remote index at {0}", RemoteIndexAddress); + StaticIdx = remote::getClient(RemoteIndexAddress, ProjectRoot); + EnableBackgroundIndex = false; + } else { + elog("When enabling remote index, IndexFile should not be specified. " + "Only one can be used at a time. The remote index will be ignored."); + } + } +#endif Opts.StaticIndex = StaticIdx.get(); Opts.AsyncThreadsCount = WorkerThreadsCount; Opts.BuildRecoveryAST = RecoveryAST; @@ -717,7 +750,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var elog("Couldn't determine user config file, not loading"); } std::vector<config::Provider *> ProviderPointers; - for (const auto& P : ProviderStack) + for (const auto &P : ProviderStack) ProviderPointers.push_back(P.get()); Config = config::Provider::combine(std::move(ProviderPointers)); Opts.ConfigProvider = Config.get(); diff --git a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp index 70d5156b10723..f1c582ef1abe5 100644 --- a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp @@ -171,7 +171,7 @@ TEST_F(BackgroundIndexTest, IndexTwoFiles) { #endif )cpp"; FS.Files[testPath("root/A.cc")] = - "#include \"A.h\"\nvoid g() { (void)common; }"; + "#include \"A.h\"\nstatic void g() { (void)common; }"; FS.Files[testPath("root/B.cc")] = R"cpp( #define A 0 diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 8ede92c16f7ad..c25e2b7f81037 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -13,10 +13,6 @@ include_directories( ${CLANGD_BINARY_DIR} ) -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - if(CLANG_BUILT_STANDALONE) # LLVMTestingSupport library is needed for clangd tests.
  if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Testing/Support
diff --git a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
index 07c42fcf20304..8576e11a5f21a 100644
--- a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
@@ -429,6 +429,40 @@ TEST(DocumentSymbols, ExternSymbol) {
   EXPECT_THAT(getSymbols(TU.build()), IsEmpty());
 }

+TEST(DocumentSymbols, ExternContext) {
+  TestTU TU;
+  TU.Code = R"cpp(
+      extern "C" {
+      void foo();
+      class Foo {};
+      }
+      namespace ns {
+      extern "C" {
+      void bar();
+      class Bar {};
+      }
+      })cpp";
+
+  EXPECT_THAT(getSymbols(TU.build()),
+              ElementsAre(WithName("foo"), WithName("Foo"),
+                          AllOf(WithName("ns"),
+                                Children(WithName("bar"), WithName("Bar")))));
+}
+
+TEST(DocumentSymbols, ExportContext) {
+  TestTU TU;
+  TU.ExtraArgs = {"-std=c++20"};
+  TU.Code = R"cpp(
+      export module test;
+      export {
+      void foo();
+      class Foo {};
+      })cpp";
+
+  EXPECT_THAT(getSymbols(TU.build()),
+              ElementsAre(WithName("foo"), WithName("Foo")));
+}
+
 TEST(DocumentSymbols, NoLocals) {
   TestTU TU;
   TU.Code = R"cpp(
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index f7af77e8b57be..3421b9cec2d30 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -405,6 +405,11 @@ TEST_F(TargetDeclTest, ClassTemplate) {
 }

 TEST_F(TargetDeclTest, Concept) {
+  Flags.push_back("-std=c++20");
+
+  // FIXME: Should we truncate the pretty-printed form of a concept decl
+  // somewhere?
+
   Code = R"cpp(
     template <typename T>
     concept Fooable = requires (T t) { t.foo(); };
@@ -414,12 +419,20 @@ TEST_F(TargetDeclTest, Concept) {
       t.foo();
     }
   )cpp";
-  Flags.push_back("-std=c++20");

   EXPECT_DECLS(
       "ConceptSpecializationExpr",
-      // FIXME: Should we truncate the pretty-printed form of a concept decl
-      // somewhere?
      {"template <typename T> concept Fooable = requires (T t) { t.foo(); };"});
+
+  // trailing requires clause
+  Code = R"cpp(
+      template <typename T>
+      concept Fooable = true;
+
+      template <typename T>
+      void foo() requires [[Fooable]]<T>;
+  )cpp";
+  EXPECT_DECLS("ConceptSpecializationExpr",
+               {"template <typename T> concept Fooable = true;"});
 }

 TEST_F(TargetDeclTest, FunctionTemplate) {
@@ -535,6 +548,7 @@ TEST_F(TargetDeclTest, OverloadExpr) {
   // FIXME: Auto-completion in a template requires disabling delayed template
   // parsing.
   Flags = {"-fno-delayed-template-parsing"};
+  Flags.push_back("--target=x86_64-pc-linux-gnu");

   Code = R"cpp(
     void func(int*);
@@ -559,6 +573,36 @@ TEST_F(TargetDeclTest, OverloadExpr) {
     };
   )cpp";
   EXPECT_DECLS("UnresolvedMemberExpr", "void func(int *)", "void func(char *)");
+
+  Code = R"cpp(
+    struct X {
+      static void *operator new(unsigned long);
+    };
+    auto* k = [[new]] X();
+  )cpp";
+  EXPECT_DECLS("CXXNewExpr", "static void *operator new(unsigned long)");
+  Code = R"cpp(
+    void *operator new(unsigned long);
+    auto* k = [[new]] int();
+  )cpp";
+  EXPECT_DECLS("CXXNewExpr", "void *operator new(unsigned long)");
+
+  Code = R"cpp(
+    struct X {
+      static void operator delete(void *) noexcept;
+    };
+    void k(X* x) {
+      [[delete]] x;
+    }
+  )cpp";
+  EXPECT_DECLS("CXXDeleteExpr", "static void operator delete(void *) noexcept");
+  Code = R"cpp(
+    void operator delete(void *) noexcept;
+    void k(int* x) {
+      [[delete]] x;
+    }
+  )cpp";
+  EXPECT_DECLS("CXXDeleteExpr", "void operator delete(void *) noexcept");
 }

 TEST_F(TargetDeclTest, DependentExprs) {
@@ -627,6 +671,20 @@ TEST_F(TargetDeclTest, DependentExprs) {
     };
   )cpp";
   EXPECT_DECLS("CXXDependentScopeMemberExpr", "int aaaa");
+
+  Code = R"cpp(
+    class Foo {
+    public:
+      static Foo k(int);
+      template <typename T> T convert() const;
+    };
+    template <typename T>
+    void test() {
+      Foo::k(T()).template [[convert]]<T>();
+    }
+  )cpp";
+  EXPECT_DECLS("CXXDependentScopeMemberExpr",
+               "template <typename T> T convert() const");
 }

 TEST_F(TargetDeclTest, ObjC) {
diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
index 9e4f75b5cca3f..3614ab2c5cb9e 100644
--- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -624,11 +624,13 @@ TEST_F(SymbolCollectorTest, Refs) {
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(Symbols, "NS").ID, _))));
   EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACRO").ID,
                                   HaveRanges(Main.ranges("macro")))));
-  // Symbols *only* in the main file (a, b, c, FUNC) had no refs collected.
+  // Symbols *only* in the main file:
+  // - (a, b) are externally visible and should have refs.
+  // - (c, FUNC) are externally invisible and have no refs collected.
   auto MainSymbols =
       TestTU::withHeaderCode(SymbolsOnlyInMainCode.code()).headerSymbols();
-  EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "a").ID, _))));
-  EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "b").ID, _))));
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(MainSymbols, "a").ID, _)));
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(MainSymbols, "b").ID, _)));
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "c").ID, _))));
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "FUNC").ID, _))));
 }
@@ -816,11 +818,15 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) {
         $Foo[[Foo]] fo;
       }
   )");
-  // The main file is normal .cpp file, we shouldn't collect any refs of symbols
-  // which are not declared in the preamble.
+  // The main file is a normal .cpp file, so we should collect the refs
+  // for externally visible symbols.
   TestFileName = testPath("foo.cpp");
   runSymbolCollector("", Header.code());
-  EXPECT_THAT(Refs, UnorderedElementsAre());
+  EXPECT_THAT(Refs,
+              UnorderedElementsAre(Pair(findSymbol(Symbols, "Foo").ID,
+                                        HaveRanges(Header.ranges("Foo"))),
+                                   Pair(findSymbol(Symbols, "Func").ID,
+                                        HaveRanges(Header.ranges("Func")))));

   // Run the .h file as main file, we should collect the refs.
TestFileName = testPath("foo.h"); diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 0428303f5b0af..c9c115fd19d83 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -41,6 +41,7 @@ using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Matcher; +using ::testing::UnorderedElementsAre; using ::testing::UnorderedElementsAreArray; MATCHER_P2(FileRange, File, Range, "") { @@ -264,19 +265,23 @@ MATCHER_P3(Sym, Name, Decl, DefOrNone, "") { << llvm::to_string(arg.PreferredDeclaration); return false; } + if (!Def && !arg.Definition) + return true; if (Def && !arg.Definition) { *result_listener << "Has no definition"; return false; } - if (Def && arg.Definition->range != *Def) { + if (!Def && arg.Definition) { + *result_listener << "Definition is " << llvm::to_string(arg.Definition); + return false; + } + if (arg.Definition->range != *Def) { *result_listener << "Definition is " << llvm::to_string(arg.Definition); return false; } return true; } -::testing::Matcher Sym(std::string Name, Range Decl) { - return Sym(Name, Decl, llvm::None); -} + MATCHER_P(Sym, Name, "") { return arg.Name == Name; } MATCHER_P(RangeIs, R, "") { return arg.range == R; } @@ -771,7 +776,7 @@ TEST(LocateSymbol, TextualSmoke) { auto AST = TU.build(); auto Index = TU.index(); EXPECT_THAT(locateSymbolAt(AST, T.point(), Index.get()), - ElementsAre(Sym("MyClass", T.range()))); + ElementsAre(Sym("MyClass", T.range(), T.range()))); } TEST(LocateSymbol, Textual) { @@ -891,18 +896,20 @@ TEST(LocateSymbol, Ambiguous) { // FIXME: Target the constructor as well. EXPECT_THAT(locateSymbolAt(AST, T.point("9")), ElementsAre(Sym("Foo"))); EXPECT_THAT(locateSymbolAt(AST, T.point("10")), - ElementsAre(Sym("Foo", T.range("ConstructorLoc")))); + ElementsAre(Sym("Foo", T.range("ConstructorLoc"), llvm::None))); EXPECT_THAT(locateSymbolAt(AST, T.point("11")), - ElementsAre(Sym("Foo", T.range("ConstructorLoc")))); + ElementsAre(Sym("Foo", T.range("ConstructorLoc"), llvm::None))); // These assertions are unordered because the order comes from // CXXRecordDecl::lookupDependentName() which doesn't appear to provide // an order guarantee. EXPECT_THAT(locateSymbolAt(AST, T.point("12")), - UnorderedElementsAre(Sym("bar", T.range("NonstaticOverload1")), - Sym("bar", T.range("NonstaticOverload2")))); - EXPECT_THAT(locateSymbolAt(AST, T.point("13")), - UnorderedElementsAre(Sym("baz", T.range("StaticOverload1")), - Sym("baz", T.range("StaticOverload2")))); + UnorderedElementsAre( + Sym("bar", T.range("NonstaticOverload1"), llvm::None), + Sym("bar", T.range("NonstaticOverload2"), llvm::None))); + EXPECT_THAT( + locateSymbolAt(AST, T.point("13")), + UnorderedElementsAre(Sym("baz", T.range("StaticOverload1"), llvm::None), + Sym("baz", T.range("StaticOverload2"), llvm::None))); } TEST(LocateSymbol, TextualDependent) { @@ -932,9 +939,10 @@ TEST(LocateSymbol, TextualDependent) { // interaction between locateASTReferent() and // locateSymbolNamedTextuallyAt(). 
auto Results = locateSymbolAt(AST, Source.point(), Index.get()); - EXPECT_THAT(Results, UnorderedElementsAre( - Sym("uniqueMethodName", Header.range("FooLoc")), - Sym("uniqueMethodName", Header.range("BarLoc")))); + EXPECT_THAT(Results, + UnorderedElementsAre( + Sym("uniqueMethodName", Header.range("FooLoc"), llvm::None), + Sym("uniqueMethodName", Header.range("BarLoc"), llvm::None))); } TEST(LocateSymbol, TemplateTypedefs) { @@ -992,20 +1000,23 @@ int [[bar_not_preamble]]; auto Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("p1")); EXPECT_TRUE(bool(Locations)) << "findDefinitions returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo", SourceAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo", SourceAnnotations.range(), + SourceAnnotations.range()))); // Go to a definition in header_in_preamble.h. Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("p2")); EXPECT_TRUE(bool(Locations)) << "findDefinitions returned an error"; EXPECT_THAT( *Locations, - ElementsAre(Sym("bar_preamble", HeaderInPreambleAnnotations.range()))); + ElementsAre(Sym("bar_preamble", HeaderInPreambleAnnotations.range(), + HeaderInPreambleAnnotations.range()))); // Go to a definition in header_not_in_preamble.h. Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("p3")); EXPECT_TRUE(bool(Locations)) << "findDefinitions returned an error"; EXPECT_THAT(*Locations, ElementsAre(Sym("bar_not_preamble", + HeaderNotInPreambleAnnotations.range(), HeaderNotInPreambleAnnotations.range()))); } @@ -1039,21 +1050,25 @@ TEST(GoToInclude, All) { // Test include in preamble. auto Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point()); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); // Test include in preamble, last char. Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("2")); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("3")); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); // Test include outside of preamble. Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("6")); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); // Test a few positions that do not result in Locations. 
Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("4")); @@ -1062,11 +1077,13 @@ TEST(GoToInclude, All) { Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("5")); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); Locations = runLocateSymbolAt(Server, FooCpp, SourceAnnotations.point("7")); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); // Objective C #import directive. Annotations ObjC(R"objc( @@ -1078,7 +1095,8 @@ TEST(GoToInclude, All) { Server.addDocument(FooM, ObjC.code()); Locations = runLocateSymbolAt(Server, FooM, ObjC.point()); ASSERT_TRUE(bool(Locations)) << "locateSymbolAt returned an error"; - EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range()))); + EXPECT_THAT(*Locations, ElementsAre(Sym("foo.h", HeaderAnnotations.range(), + HeaderAnnotations.range()))); } TEST(LocateSymbol, WithPreamble) { @@ -1103,7 +1121,7 @@ TEST(LocateSymbol, WithPreamble) { // LocateSymbol goes to a #include file: the result comes from the preamble. EXPECT_THAT( cantFail(runLocateSymbolAt(Server, FooCpp, FooWithHeader.point())), - ElementsAre(Sym("foo.h", FooHeader.range()))); + ElementsAre(Sym("foo.h", FooHeader.range(), FooHeader.range()))); // Only preamble is built, and no AST is built in this request. Server.addDocument(FooCpp, FooWithoutHeader.code(), "null", @@ -1112,7 +1130,7 @@ TEST(LocateSymbol, WithPreamble) { // stale one. EXPECT_THAT( cantFail(runLocateSymbolAt(Server, FooCpp, FooWithoutHeader.point())), - ElementsAre(Sym("foo", FooWithoutHeader.range()))); + ElementsAre(Sym("foo", FooWithoutHeader.range(), llvm::None))); // Reset test environment. runAddDocument(Server, FooCpp, FooWithHeader.code()); @@ -1122,7 +1140,7 @@ TEST(LocateSymbol, WithPreamble) { // Use the AST being built in above request. EXPECT_THAT( cantFail(runLocateSymbolAt(Server, FooCpp, FooWithoutHeader.point())), - ElementsAre(Sym("foo", FooWithoutHeader.range()))); + ElementsAre(Sym("foo", FooWithoutHeader.range(), llvm::None))); } TEST(LocateSymbol, NearbyTokenSmoke) { @@ -1133,7 +1151,7 @@ TEST(LocateSymbol, NearbyTokenSmoke) { auto AST = TestTU::withCode(T.code()).build(); // We don't pass an index, so can't hit index-based fallback. EXPECT_THAT(locateSymbolAt(AST, T.point()), - ElementsAre(Sym("err", T.range()))); + ElementsAre(Sym("err", T.range(), T.range()))); } TEST(LocateSymbol, NearbyIdentifier) { @@ -1197,7 +1215,14 @@ TEST(LocateSymbol, NearbyIdentifier) { // h^i )cpp", - }; + R"cpp( + // prefer nearest occurrence even if several matched tokens + // have the same value of `floor(log2( - ))`. 
+ int hello; + int x = hello, y = hello; + int z = [[hello]]; + // h^ello + )cpp"}; for (const char *Test : Tests) { Annotations T(Test); auto AST = TestTU::withCode(T.code()).build(); diff --git a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp index 7db7c03d61c9b..ff819146621a2 100644 --- a/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp +++ b/clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "../TestTU.h" +#include "Index.pb.h" #include "TestFS.h" #include "index/Index.h" #include "index/Ref.h" +#include "index/Relation.h" #include "index/Serialization.h" #include "index/Symbol.h" #include "index/SymbolID.h" @@ -18,6 +20,8 @@ #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Path.h" #include "llvm/Support/StringSaver.h" #include "gmock/gmock.h" @@ -37,6 +41,51 @@ const char *testPathURI(llvm::StringRef Path, return Strings.save(URI.toString()).begin(); } +clangd::Symbol createSymbol(llvm::StringRef PathPrefix, + llvm::UniqueStringSaver &Strings) { + clangd::Symbol Sym; + Sym.ID = llvm::cantFail(SymbolID::fromStr("057557CEBF6E6B2D")); + + index::SymbolInfo Info; + Info.Kind = index::SymbolKind::Function; + Info.SubKind = index::SymbolSubKind::AccessorGetter; + Info.Lang = index::SymbolLanguage::CXX; + Info.Properties = static_cast( + index::SymbolProperty::TemplateSpecialization); + Sym.SymInfo = Info; + + Sym.Name = Strings.save("Foo"); + Sym.Scope = Strings.save("llvm::foo::bar::"); + + clangd::SymbolLocation Location; + Location.Start.setLine(1); + Location.Start.setColumn(15); + Location.End.setLine(3); + Location.End.setColumn(121); + Location.FileURI = testPathURI(PathPrefix.str() + "Definition.cpp", Strings); + Sym.Definition = Location; + + Location.Start.setLine(42); + Location.Start.setColumn(31); + Location.End.setLine(20); + Location.End.setColumn(400); + Location.FileURI = testPathURI(PathPrefix.str() + "Declaration.h", Strings); + Sym.CanonicalDeclaration = Location; + + Sym.References = 9000; + Sym.Origin = clangd::SymbolOrigin::Static; + Sym.Signature = Strings.save("(int X, char Y, Type T)"); + Sym.TemplateSpecializationArgs = Strings.save(""); + Sym.CompletionSnippetSuffix = + Strings.save("({1: int X}, {2: char Y}, {3: Type T})"); + Sym.Documentation = Strings.save("This is my amazing Foo constructor!"); + Sym.ReturnType = Strings.save("Foo"); + + Sym.Flags = clangd::Symbol::SymbolFlag::IndexedForCodeCompletion; + + return Sym; +} + TEST(RemoteMarshallingTest, URITranslation) { llvm::BumpPtrAllocator Arena; llvm::UniqueStringSaver Strings(Arena); @@ -49,11 +98,11 @@ TEST(RemoteMarshallingTest, URITranslation) { "clangd/unittests/remote/MarshallingTests.cpp", Strings); auto Serialized = ProtobufMarshaller.toProtobuf(Original); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(bool(Serialized)); EXPECT_EQ(Serialized->location().file_path(), "clang-tools-extra/clangd/unittests/remote/MarshallingTests.cpp"); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(bool(Deserialized)); EXPECT_STREQ(Deserialized->Location.FileURI, testPathURI("home/my-projects/llvm-project/clang-tools-extra/" "clangd/unittests/remote/MarshallingTests.cpp", @@ -62,22 +111,25 @@ 
TEST(RemoteMarshallingTest, URITranslation) { // Can't have empty paths. *Serialized->mutable_location()->mutable_file_path() = std::string(); Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(bool(Deserialized)); + llvm::consumeError(Deserialized.takeError()); clangd::Ref WithInvalidURI; // Invalid URI results in serialization failure. WithInvalidURI.Location.FileURI = "This is not a URI"; - Serialized = ProtobufMarshaller.toProtobuf(WithInvalidURI); - EXPECT_FALSE(Serialized); + auto DeserializedRef = ProtobufMarshaller.toProtobuf(WithInvalidURI); + EXPECT_FALSE(bool(DeserializedRef)); + llvm::consumeError(DeserializedRef.takeError()); // Can not use URIs with scheme different from "file". auto UnittestURI = URI::create(testPath("project/lib/HelloWorld.cpp"), "unittest"); - EXPECT_TRUE(bool(UnittestURI)); + ASSERT_TRUE(bool(UnittestURI)); WithInvalidURI.Location.FileURI = Strings.save(UnittestURI->toString()).begin(); - Serialized = ProtobufMarshaller.toProtobuf(WithInvalidURI); - EXPECT_FALSE(Serialized); + auto DeserializedSymbol = ProtobufMarshaller.toProtobuf(WithInvalidURI); + EXPECT_FALSE(bool(DeserializedSymbol)); + llvm::consumeError(DeserializedSymbol.takeError()); // Paths transmitted over the wire can not be absolute, they have to be // relative. @@ -85,64 +137,23 @@ TEST(RemoteMarshallingTest, URITranslation) { *WithAbsolutePath.mutable_location()->mutable_file_path() = "/usr/local/user/home/HelloWorld.cpp"; Deserialized = ProtobufMarshaller.fromProtobuf(WithAbsolutePath); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(bool(Deserialized)); + llvm::consumeError(Deserialized.takeError()); } TEST(RemoteMarshallingTest, SymbolSerialization) { - clangd::Symbol Sym; - - auto ID = SymbolID::fromStr("057557CEBF6E6B2D"); - EXPECT_TRUE(bool(ID)); - Sym.ID = *ID; - - index::SymbolInfo Info; - Info.Kind = index::SymbolKind::Function; - Info.SubKind = index::SymbolSubKind::AccessorGetter; - Info.Lang = index::SymbolLanguage::CXX; - Info.Properties = static_cast( - index::SymbolProperty::TemplateSpecialization); - Sym.SymInfo = Info; - llvm::BumpPtrAllocator Arena; llvm::UniqueStringSaver Strings(Arena); - Sym.Name = Strings.save("Foo"); - Sym.Scope = Strings.save("llvm::foo::bar::"); - - clangd::SymbolLocation Location; - Location.Start.setLine(1); - Location.Start.setColumn(15); - Location.End.setLine(3); - Location.End.setColumn(121); - Location.FileURI = testPathURI("home/Definition.cpp", Strings); - Sym.Definition = Location; - - Location.Start.setLine(42); - Location.Start.setColumn(31); - Location.End.setLine(20); - Location.End.setColumn(400); - Location.FileURI = testPathURI("home/Declaration.h", Strings); - Sym.CanonicalDeclaration = Location; - - Sym.References = 9000; - Sym.Origin = clangd::SymbolOrigin::Static; - Sym.Signature = Strings.save("(int X, char Y, Type T)"); - Sym.TemplateSpecializationArgs = Strings.save(""); - Sym.CompletionSnippetSuffix = - Strings.save("({1: int X}, {2: char Y}, {3: Type T})"); - Sym.Documentation = Strings.save("This is my amazing Foo constructor!"); - Sym.ReturnType = Strings.save("Foo"); - - Sym.Flags = clangd::Symbol::SymbolFlag::IndexedForCodeCompletion; - + clangd::Symbol Sym = createSymbol("home/", Strings); Marshaller ProtobufMarshaller(testPath("home/"), testPath("home/")); // Check that symbols are exactly the same if the path to indexed project is // the same on indexing machine and the client. 
auto Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(bool(Serialized)); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(bool(Deserialized)); EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized)); // Serialized paths are relative and have UNIX slashes. EXPECT_EQ(convert_to_slash(Serialized->definition().file_path(), @@ -154,44 +165,44 @@ TEST(RemoteMarshallingTest, SymbolSerialization) { // Missing definition is OK. Sym.Definition = clangd::SymbolLocation(); Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); - Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(bool(Serialized)); + ASSERT_TRUE(bool(ProtobufMarshaller.fromProtobuf(*Serialized))); // Relative path is absolute. *Serialized->mutable_canonical_declaration()->mutable_file_path() = convert_to_slash("/path/to/Declaration.h"); Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_FALSE(Deserialized); + EXPECT_FALSE(bool(Deserialized)); + llvm::consumeError(Deserialized.takeError()); // Fail with an invalid URI. - Location.FileURI = "Not A URI"; - Sym.Definition = Location; + Sym.Definition.FileURI = "Not A URI"; Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(bool(Serialized)); + llvm::consumeError(Serialized.takeError()); // Schemes other than "file" can not be used. auto UnittestURI = URI::create(testPath("home/SomePath.h"), "unittest"); - EXPECT_TRUE(bool(UnittestURI)); - Location.FileURI = Strings.save(UnittestURI->toString()).begin(); - Sym.Definition = Location; + ASSERT_TRUE(bool(UnittestURI)); + Sym.Definition.FileURI = Strings.save(UnittestURI->toString()).begin(); Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_FALSE(Serialized); + EXPECT_FALSE(bool(Serialized)); + llvm::consumeError(Serialized.takeError()); // Passing root that is not prefix of the original file path. - Location.FileURI = testPathURI("home/File.h", Strings); - Sym.Definition = Location; + Sym.Definition.FileURI = testPathURI("home/File.h", Strings); // Check that the symbol is valid and passing the correct path works. Serialized = ProtobufMarshaller.toProtobuf(Sym); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(bool(Serialized)); Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(bool(Deserialized)); EXPECT_STREQ(Deserialized->Definition.FileURI, testPathURI("home/File.h", Strings)); // Fail with a wrong root. Marshaller WrongMarshaller(testPath("nothome/"), testPath("home/")); Serialized = WrongMarshaller.toProtobuf(Sym); EXPECT_FALSE(Serialized); + llvm::consumeError(Serialized.takeError()); } TEST(RemoteMarshallingTest, RefSerialization) { @@ -214,9 +225,9 @@ TEST(RemoteMarshallingTest, RefSerialization) { testPath("llvm-project/")); auto Serialized = ProtobufMarshaller.toProtobuf(Ref); - EXPECT_TRUE(Serialized); + ASSERT_TRUE(bool(Serialized)); auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized); - EXPECT_TRUE(Deserialized); + ASSERT_TRUE(bool(Deserialized)); EXPECT_EQ(toYAML(Ref), toYAML(*Deserialized)); } @@ -224,60 +235,84 @@ TEST(RemoteMarshallingTest, IncludeHeaderURIs) { llvm::BumpPtrAllocator Arena; llvm::UniqueStringSaver Strings(Arena); - llvm::SmallVector - ValidHeaders; + clangd::Symbol Sym = createSymbol("remote/", Strings); + clangd::Symbol::IncludeHeaderWithReferences Header; + // Add only valid headers. 
 Header.IncludeHeader = Strings.save(
      URI::createFile("/usr/local/user/home/project/Header.h").toString());
  Header.References = 21;
-  ValidHeaders.push_back(Header);
+  Sym.IncludeHeaders.push_back(Header);
   Header.IncludeHeader = Strings.save("");
   Header.References = 100;
-  ValidHeaders.push_back(Header);
+  Sym.IncludeHeaders.push_back(Header);
   Header.IncludeHeader = Strings.save("\"cstdio\"");
   Header.References = 200;
-  ValidHeaders.push_back(Header);
+  Sym.IncludeHeaders.push_back(Header);
+
+  Marshaller ProtobufMarshaller(convert_to_slash("/"), convert_to_slash("/"));
+
+  auto Serialized = ProtobufMarshaller.toProtobuf(Sym);
+  ASSERT_TRUE(bool(Serialized));
+  EXPECT_EQ(static_cast<size_t>(Serialized->headers_size()),
+            Sym.IncludeHeaders.size());
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized);
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized));

-  llvm::SmallVector
-      InvalidHeaders;
   // This is an absolute path to a header: can not be transmitted over the wire.
   Header.IncludeHeader = Strings.save(testPath("project/include/Common.h"));
   Header.References = 42;
-  InvalidHeaders.push_back(Header);
+  Sym.IncludeHeaders.push_back(Header);
+  Serialized = ProtobufMarshaller.toProtobuf(Sym);
+  EXPECT_FALSE(bool(Serialized));
+  llvm::consumeError(Serialized.takeError());
+
+  // Remove the last, invalid header.
+  Sym.IncludeHeaders.pop_back();

   // This is not a valid header: can not be transmitted over the wire.
   Header.IncludeHeader = Strings.save("NotAHeader");
   Header.References = 5;
-  InvalidHeaders.push_back(Header);
+  Sym.IncludeHeaders.push_back(Header);
+  Serialized = ProtobufMarshaller.toProtobuf(Sym);
+  EXPECT_FALSE(bool(Serialized));
+  llvm::consumeError(Serialized.takeError());

-  clangd::Symbol Sym;
-  // Fill in definition and declaration, Symbool will be invalid otherwise.
-  clangd::SymbolLocation Location;
-  Location.Start.setLine(1);
-  Location.Start.setColumn(2);
-  Location.End.setLine(3);
-  Location.End.setColumn(4);
-  Location.FileURI = testPathURI("File.h", Strings);
-  Sym.Definition = Location;
-  Sym.CanonicalDeclaration = Location;
+  // Try putting an invalid header into an already serialized symbol.
+  Sym.IncludeHeaders.pop_back();
+  Serialized = ProtobufMarshaller.toProtobuf(Sym);
+  ASSERT_TRUE(bool(Serialized));
+  HeaderWithReferences InvalidHeader;
+  InvalidHeader.set_header(convert_to_slash("/absolute/path/Header.h"));
+  InvalidHeader.set_references(9000);
+  *Serialized->add_headers() = InvalidHeader;
+  Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized);
+  EXPECT_FALSE(bool(Deserialized));
+  llvm::consumeError(Deserialized.takeError());
+}

-  // Try to serialize all headers but only valid ones will end up in Protobuf
-  // message.
-  auto AllHeaders = ValidHeaders;
-  AllHeaders.insert(AllHeaders.end(), InvalidHeaders.begin(),
-                    InvalidHeaders.end());
-  Sym.IncludeHeaders = AllHeaders;
+TEST(RemoteMarshallingTest, LookupRequestSerialization) {
+  clangd::LookupRequest Request;
+  Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000001")));
+  Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000002")));

-  Marshaller ProtobufMarshaller(convert_to_slash("/"), convert_to_slash("/"));
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));

-  auto Serialized = ProtobufMarshaller.toProtobuf(Sym);
-  EXPECT_EQ(static_cast<size_t>(Serialized->headers_size()),
-            ValidHeaders.size());
-  EXPECT_TRUE(Serialized);
-  auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized);
-  EXPECT_TRUE(Deserialized);
+  auto Serialized = ProtobufMarshaller.toProtobuf(Request);
+  EXPECT_EQ(static_cast<size_t>(Serialized.ids_size()), Request.IDs.size());
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_EQ(Deserialized->IDs, Request.IDs);
+}

-  Sym.IncludeHeaders = ValidHeaders;
-  EXPECT_EQ(toYAML(Sym), toYAML(*Deserialized));
+TEST(RemoteMarshallingTest, LookupRequestFailingSerialization) {
+  clangd::LookupRequest Request;
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+  auto Serialized = ProtobufMarshaller.toProtobuf(Request);
+  Serialized.add_ids("Invalid Symbol ID");
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  EXPECT_FALSE(bool(Deserialized));
+  llvm::consumeError(Deserialized.takeError());
 }

 TEST(RemoteMarshallingTest, FuzzyFindRequestSerialization) {
@@ -289,19 +324,106 @@ TEST(RemoteMarshallingTest, FuzzyFindRequestSerialization) {
   auto Serialized = ProtobufMarshaller.toProtobuf(Request);
   EXPECT_EQ(Serialized.proximity_paths_size(), 2);
   auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
-  EXPECT_THAT(Deserialized.ProximityPaths,
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_THAT(Deserialized->ProximityPaths,
               testing::ElementsAre(testPath("remote/Header.h"),
                                    testPath("remote/subdir/OtherHeader.h")));
 }

+TEST(RemoteMarshallingTest, RefsRequestSerialization) {
+  clangd::RefsRequest Request;
+  Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000001")));
+  Request.IDs.insert(llvm::cantFail(SymbolID::fromStr("0000000000000002")));
+
+  Request.Limit = 9000;
+  Request.Filter = RefKind::Spelled | RefKind::Declaration;
+
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+
+  auto Serialized = ProtobufMarshaller.toProtobuf(Request);
+  EXPECT_EQ(static_cast<size_t>(Serialized.ids_size()), Request.IDs.size());
+  EXPECT_EQ(Serialized.limit(), Request.Limit);
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_EQ(Deserialized->IDs, Request.IDs);
+  ASSERT_TRUE(Deserialized->Limit);
+  EXPECT_EQ(*Deserialized->Limit, Request.Limit);
+  EXPECT_EQ(Deserialized->Filter, Request.Filter);
+}
+
+TEST(RemoteMarshallingTest, RefsRequestFailingSerialization) {
+  clangd::RefsRequest Request;
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+  auto Serialized = ProtobufMarshaller.toProtobuf(Request);
+  Serialized.add_ids("Invalid Symbol ID");
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  EXPECT_FALSE(bool(Deserialized));
+  llvm::consumeError(Deserialized.takeError());
+}
+
+TEST(RemoteMarshallingTest, RelationsRequestSerialization) {
+  clangd::RelationsRequest Request;
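+  // Two distinct subject IDs, a limit, and a predicate should all survive
+  // the round trip below.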
+  Request.Subjects.insert(
+      llvm::cantFail(SymbolID::fromStr("0000000000000001")));
+  Request.Subjects.insert(
+      llvm::cantFail(SymbolID::fromStr("0000000000000002")));
+
+  Request.Limit = 9000;
+  Request.Predicate = RelationKind::BaseOf;
+
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+
+  auto Serialized = ProtobufMarshaller.toProtobuf(Request);
+  EXPECT_EQ(static_cast<size_t>(Serialized.subjects_size()),
+            Request.Subjects.size());
+  EXPECT_EQ(Serialized.limit(), Request.Limit);
+  EXPECT_EQ(static_cast<RelationKind>(Serialized.predicate()),
+            Request.Predicate);
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_EQ(Deserialized->Subjects, Request.Subjects);
+  ASSERT_TRUE(Deserialized->Limit);
+  EXPECT_EQ(*Deserialized->Limit, Request.Limit);
+  EXPECT_EQ(Deserialized->Predicate, Request.Predicate);
+}
+
+TEST(RemoteMarshallingTest, RelationsRequestFailingSerialization) {
+  RelationsRequest Serialized;
+  Serialized.add_subjects("ZZZZZZZZZZZZZZZZ");
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(&Serialized);
+  EXPECT_FALSE(bool(Deserialized));
+  llvm::consumeError(Deserialized.takeError());
+}
+
+TEST(RemoteMarshallingTest, RelationsSerialization) {
+  llvm::BumpPtrAllocator Arena;
+  llvm::UniqueStringSaver Strings(Arena);
+
+  clangd::Symbol Sym = createSymbol("remote/", Strings);
+  SymbolID ID = llvm::cantFail(SymbolID::fromStr("0000000000000002"));
+  Marshaller ProtobufMarshaller(testPath("remote/"), testPath("local/"));
+  auto Serialized = ProtobufMarshaller.toProtobuf(ID, Sym);
+  ASSERT_TRUE(bool(Serialized));
+  auto Deserialized = ProtobufMarshaller.fromProtobuf(*Serialized);
+  ASSERT_TRUE(bool(Deserialized));
+  EXPECT_THAT(Deserialized->first, ID);
+  EXPECT_THAT(Deserialized->second.ID, Sym.ID);
+}
+
 TEST(RemoteMarshallingTest, RelativePathToURITranslation) {
   Marshaller ProtobufMarshaller(/*RemoteIndexRoot=*/"",
                                 /*LocalIndexRoot=*/testPath("home/project/"));
-  EXPECT_TRUE(ProtobufMarshaller.relativePathToURI("lib/File.cpp"));
+  auto URIString = ProtobufMarshaller.relativePathToURI("lib/File.cpp");
+  ASSERT_TRUE(bool(URIString));
   // RelativePath can not be absolute.
-  EXPECT_FALSE(ProtobufMarshaller.relativePathToURI("/lib/File.cpp"));
+  URIString = ProtobufMarshaller.relativePathToURI("/lib/File.cpp");
+  EXPECT_FALSE(bool(URIString));
+  llvm::consumeError(URIString.takeError());
   // RelativePath can not be empty.
-  EXPECT_FALSE(ProtobufMarshaller.relativePathToURI(std::string()));
+  URIString = ProtobufMarshaller.relativePathToURI(std::string());
+  EXPECT_FALSE(bool(URIString));
+  llvm::consumeError(URIString.takeError());
 }

 TEST(RemoteMarshallingTest, URIToRelativePathTranslation) {
@@ -309,14 +431,17 @@ TEST(RemoteMarshallingTest, URIToRelativePathTranslation) {
   llvm::UniqueStringSaver Strings(Arena);
   Marshaller ProtobufMarshaller(/*RemoteIndexRoot=*/testPath("remote/project/"),
                                 /*LocalIndexRoot=*/"");
-  EXPECT_TRUE(ProtobufMarshaller.uriToRelativePath(
-      testPathURI("remote/project/lib/File.cpp", Strings)));
+  auto RelativePath = ProtobufMarshaller.uriToRelativePath(
+      testPathURI("remote/project/lib/File.cpp", Strings));
+  ASSERT_TRUE(bool(RelativePath));
   // RemoteIndexRoot has to be a prefix of the file path.
 Marshaller WrongMarshaller(
      /*RemoteIndexRoot=*/testPath("remote/other/project/"),
      /*LocalIndexRoot=*/"");
-  EXPECT_FALSE(WrongMarshaller.uriToRelativePath(
-      testPathURI("remote/project/lib/File.cpp", Strings)));
+  RelativePath = WrongMarshaller.uriToRelativePath(
+      testPathURI("remote/project/lib/File.cpp", Strings));
+  EXPECT_FALSE(bool(RelativePath));
+  llvm::consumeError(RelativePath.takeError());
 }

 } // namespace
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 1d447938eae0c..89f6a50888ccc 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -67,7 +67,14 @@ The improvements are...
 Improvements to clang-tidy
 --------------------------

-The improvements are...
+Changes in existing checks
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Improved :doc:`readability-identifier-naming
+  <clang-tidy/checks/readability-identifier-naming>` check.
+
+  Added an option `GetConfigPerFile` to support including files which use
+  different naming styles.

 Improvements to include-fixer
 -----------------------------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability-identifier-naming.rst b/clang-tools-extra/docs/clang-tidy/checks/readability-identifier-naming.rst
index eefa5234fb211..9eec3c03f7d7d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability-identifier-naming.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability-identifier-naming.rst
@@ -51,6 +51,7 @@ The following options are described below:
  - :option:`EnumCase`, :option:`EnumPrefix`, :option:`EnumSuffix`
  - :option:`EnumConstantCase`, :option:`EnumConstantPrefix`, :option:`EnumConstantSuffix`
  - :option:`FunctionCase`, :option:`FunctionPrefix`, :option:`FunctionSuffix`
+ - :option:`GetConfigPerFile`
 - :option:`GlobalConstantCase`, :option:`GlobalConstantPrefix`, :option:`GlobalConstantSuffix`
 - :option:`GlobalConstantPointerCase`, :option:`GlobalConstantPointerPrefix`, :option:`GlobalConstantPointerSuffix`
 - :option:`GlobalFunctionCase`, :option:`GlobalFunctionPrefix`, :option:`GlobalFunctionSuffix`
@@ -713,6 +714,13 @@ After:

   char pre_my_function_string_post();

+.. option:: GetConfigPerFile
+
+   When `true`, the check will look up the configuration for the file where an
+   identifier is declared, so that included header files can use a different
+   style. Default value is `true`.
+
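+   For example, a ``.clang-tidy`` file placed next to a set of headers can pin
+   down the style for the identifiers declared there (an illustrative sketch;
+   any check option can be set per directory this way):
+
+   .. code-block:: yaml
+
+     Checks: readability-identifier-naming
+     CheckOptions:
+       - key:   readability-identifier-naming.GlobalFunctionCase
+         value: lower_case
+
.. 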
option:: GlobalConstantCase When defined, the check will ensure global constant names conform to the diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/.clang-tidy b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/.clang-tidy new file mode 100644 index 0000000000000..6a704df8b7b19 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/.clang-tidy @@ -0,0 +1,5 @@ +Checks: -readability-identifier-naming +CheckOptions: + - key: readability-identifier-naming.GlobalFunctionCase + value: lower_case + diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/header.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/header.h new file mode 100644 index 0000000000000..e863f70f7fcb2 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style-disabled/header.h @@ -0,0 +1,3 @@ +void disabled_style_1(); +void disabledStyle2(); +void DISABLED_STYLE_3(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/.clang-tidy b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/.clang-tidy new file mode 100644 index 0000000000000..85af9672b61d3 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/.clang-tidy @@ -0,0 +1,5 @@ +Checks: readability-identifier-naming +CheckOptions: + - key: readability-identifier-naming.GlobalFunctionCase + value: lower_case + diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/header.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/header.h new file mode 100644 index 0000000000000..b170bed7c3f60 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style1/header.h @@ -0,0 +1,5 @@ + + +void style_first_good(); + +void styleFirstBad(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/.clang-tidy b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/.clang-tidy new file mode 100644 index 0000000000000..b2e67ea9c87b5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/.clang-tidy @@ -0,0 +1,5 @@ +Checks: readability-identifier-naming +CheckOptions: + - key: readability-identifier-naming.GlobalFunctionCase + value: UPPER_CASE + diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/header.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/header.h new file mode 100644 index 0000000000000..6b78ad82a1fdd --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/readability-identifier-naming/global-style2/header.h @@ -0,0 +1,5 @@ + + +void STYLE_SECOND_GOOD(); + +void styleSecondBad(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp index 2c288e0bbddf9..8db47ea190d8f 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp @@ -107,5 +107,5 @@ struct S { // CHECK-FIXES: {{^}} void f(char c) { s += std::to_string(c | static_cast(t)); } }; -template S; +template struct S; } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-init-variables-conflict.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-init-variables-conflict.cpp new file mode 100644 index 0000000000000..4d67feb4161f8 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-init-variables-conflict.cpp @@ -0,0 +1,15 @@ +// RUN: %check_clang_tidy %s cppcoreguidelines-init-variables,readability-isolate-declaration %t + +void foo() { + int A, B, C; + // CHECK-MESSAGES-DAG: :[[@LINE-1]]:7: warning: variable 'A' is not initialized + // CHECK-MESSAGES-DAG: :[[@LINE-2]]:10: warning: variable 'B' is not initialized + // CHECK-MESSAGES-DAG: :[[@LINE-3]]:13: warning: variable 'C' is not initialized + // CHECK-MESSAGES-DAG: :[[@LINE-4]]:3: warning: multiple declarations in a single statement reduces readability + + // Only the isolate declarations fix-it should be applied + + // CHECK-FIXES: int A; + // CHECK-FIXES-NEXT: int B; + // CHECK-FIXES-NEXT: int C; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/google-module.cpp b/clang-tools-extra/test/clang-tidy/checkers/google-module.cpp index 494ac48e17633..2c82237e4186d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google-module.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google-module.cpp @@ -1,10 +1,6 @@ // RUN: clang-tidy -checks='-*,google*' -config='{}' -dump-config - -- | FileCheck %s // CHECK: CheckOptions: -// CHECK: {{- key: *google-readability-braces-around-statements.ShortStatementLines}} -// CHECK-NEXT: {{value: *'1'}} -// CHECK: {{- key: *google-readability-function-size.StatementThreshold}} -// CHECK-NEXT: {{value: *'800'}} -// CHECK: {{- key: *google-readability-namespace-comments.ShortNamespaceLines}} -// CHECK-NEXT: {{value: *'10'}} -// CHECK: {{- key: *google-readability-namespace-comments.SpacesBeforeComments}} -// CHECK-NEXT: {{value: *'2'}} +// CHECK-DAG: {{- key: *google-readability-braces-around-statements.ShortStatementLines *[[:space:]] *value: *'1'}} +// CHECK-DAG: {{- key: *google-readability-function-size.StatementThreshold *[[:space:]] *value: *'800'}} +// CHECK-DAG: {{- key: *google-readability-namespace-comments.ShortNamespaceLines *[[:space:]] *value: *'10'}} +// CHECK-DAG: {{- key: *google-readability-namespace-comments.SpacesBeforeComments *[[:space:]] *value: *'2'}} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming-multiple-styles.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming-multiple-styles.cpp new file mode 100644 index 0000000000000..54880d2ca3d0a --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming-multiple-styles.cpp @@ -0,0 +1,64 @@ +// Setup header directory + +// RUN: rm -rf %theaders +// RUN: mkdir %theaders +// RUN: cp -R %S/Inputs/readability-identifier-naming/. %theaders + +// C++11 isn't explicitly required, but failing to specify a standard means the +// check will run multiple times for different standards. This will cause the +// second test to fail as the header file will be changed during the first run. +// InheritParentConfig is needed to look for the clang-tidy configuration files. 
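+// On the ENABLED run, each header is checked against the .clang-tidy
+// configuration from its own directory.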
+ +// RUN: %check_clang_tidy -check-suffixes=ENABLED,SHARED -std=c++11 %s \ +// RUN: readability-identifier-naming %t -- \ +// RUN: -config='{ InheritParentConfig: true, CheckOptions: [ \ +// RUN: {key: readability-identifier-naming.FunctionCase, value: camelBack}, \ +// RUN: {key: readability-identifier-naming.GetConfigPerFile, value: true} \ +// RUN: ]}' -header-filter='.*' -- -I%theaders + +// On DISABLED run, everything should be made 'camelBack'. + +// RUN: cp -R %S/Inputs/readability-identifier-naming/. %theaders +// RUN: %check_clang_tidy -check-suffixes=DISABLED,SHARED -std=c++11 %s \ +// RUN: readability-identifier-naming %t -- \ +// RUN: -config='{ InheritParentConfig: true, CheckOptions: [ \ +// RUN: {key: readability-identifier-naming.FunctionCase, value: camelBack}, \ +// RUN: {key: readability-identifier-naming.GetConfigPerFile, value: false} \ +// RUN: ]}' -header-filter='.*' -- -I%theaders + +#include "global-style-disabled/header.h" +#include "global-style1/header.h" +#include "global-style2/header.h" +// CHECK-MESSAGES-ENABLED-DAG: global-style1/header.h:5:6: warning: invalid case style for global function 'styleFirstBad' +// CHECK-MESSAGES-ENABLED-DAG: global-style2/header.h:5:6: warning: invalid case style for global function 'styleSecondBad' +// CHECK-MESSAGES-DISABLED-DAG: global-style1/header.h:3:6: warning: invalid case style for function 'style_first_good' +// CHECK-MESSAGES-DISABLED-DAG: global-style2/header.h:3:6: warning: invalid case style for function 'STYLE_SECOND_GOOD' +// CHECK-MESSAGES-DISABLED-DAG: global-style-disabled/header.h:1:6: warning: invalid case style for function 'disabled_style_1' +// CHECK-MESSAGES-DISABLED-DAG: global-style-disabled/header.h:3:6: warning: invalid case style for function 'DISABLED_STYLE_3' + +void goodStyle() { + style_first_good(); + STYLE_SECOND_GOOD(); + // CHECK-FIXES-DISABLED: styleFirstGood(); + // CHECK-FIXES-DISABLED-NEXT: styleSecondGood(); +} +// CHECK-MESSAGES-SHARED-DAG: :[[@LINE+1]]:6: warning: invalid case style for function 'bad_style' +void bad_style() { + styleFirstBad(); + styleSecondBad(); +} +// CHECK-FIXES-SHARED: void badStyle() { +// CHECK-FIXES-DISABLED-NEXT: styleFirstBad(); +// CHECK-FIXES-ENABLED-NEXT: style_first_bad(); +// CHECK-FIXES-DISABLED-NEXT: styleSecondBad(); +// CHECK-FIXES-ENABLED-NEXT: STYLE_SECOND_BAD(); +// CHECK-FIXES-SHARED-NEXT: } + +void expectNoStyle() { + disabled_style_1(); + disabledStyle2(); + DISABLED_STYLE_3(); + // CHECK-FIXES-DISABLED: disabledStyle1(); + // CHECK-FIXES-DISABLED-NEXT: disabledStyle2(); + // CHECK-FIXES-DISABLED-NEXT: disabledStyle3(); +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp index 24c1c4270dec8..fed362bbecdec 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp @@ -578,3 +578,8 @@ void Foo() { #undef M1 #undef DUP } // namespace scratchspace + +template +auto GetRes(type_t& Param) -> decltype(Param.res()); +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: invalid case style for parameter 'Param' +// CHECK-FIXES: auto GetRes(type_t& a_param) -> decltype(a_param.res()); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp index 2561b81805bda..e1df8cccc10b3 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp @@ -15,6 +15,8 @@ struct basic_string { basic_string(); basic_string(const C *p, const A &a = A()); + ~basic_string(); + const C *c_str() const; const C *data() const; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp index d2a0a8c2a150f..d708ec8777c9a 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/config-files.cpp @@ -17,14 +17,10 @@ // For this test we have to use names of the real checks because otherwise values are ignored. // RUN: clang-tidy -dump-config %S/Inputs/config-files/4/44/- -- | FileCheck %s -check-prefix=CHECK-CHILD4 // CHECK-CHILD4: Checks: {{.*}}modernize-loop-convert,modernize-use-using,llvm-qualified-auto -// CHECK-CHILD4: - key: llvm-qualified-auto.AddConstToQualified -// CHECK-CHILD4-NEXT: value: 'true' -// CHECK-CHILD4: - key: modernize-loop-convert.MaxCopySize -// CHECK-CHILD4-NEXT: value: '20' -// CHECK-CHILD4: - key: modernize-loop-convert.MinConfidence -// CHECK-CHILD4-NEXT: value: reasonable -// CHECK-CHILD4: - key: modernize-use-using.IgnoreMacros -// CHECK-CHILD4-NEXT: value: 'false' +// CHECK-CHILD4-DAG: - key: llvm-qualified-auto.AddConstToQualified{{ *[[:space:]] *}}value: 'true' +// CHECK-CHILD4-DAG: - key: modernize-loop-convert.MaxCopySize{{ *[[:space:]] *}}value: '20' +// CHECK-CHILD4-DAG: - key: modernize-loop-convert.MinConfidence{{ *[[:space:]] *}}value: reasonable +// CHECK-CHILD4-DAG: - key: modernize-use-using.IgnoreMacros{{ *[[:space:]] *}}value: 'false' // RUN: clang-tidy --explain-config %S/Inputs/config-files/4/44/- -- | FileCheck %s -check-prefix=CHECK-EXPLAIN // CHECK-EXPLAIN: 'llvm-qualified-auto' is enabled in the {{.*}}{{[/\\]}}Inputs{{[/\\]}}config-files{{[/\\]}}4{{[/\\]}}44{{[/\\]}}.clang-tidy. @@ -37,16 +33,13 @@ // RUN: CheckOptions: [{key: modernize-loop-convert.MaxCopySize, value: 21}]}' \ // RUN: %S/Inputs/config-files/4/44/- -- | FileCheck %s -check-prefix=CHECK-CHILD5 // CHECK-CHILD5: Checks: {{.*}}modernize-loop-convert,modernize-use-using,llvm-qualified-auto,-llvm-qualified-auto -// CHECK-CHILD5: - key: modernize-loop-convert.MaxCopySize -// CHECK-CHILD5-NEXT: value: '21' -// CHECK-CHILD5: - key: modernize-loop-convert.MinConfidence -// CHECK-CHILD5-NEXT: value: reasonable -// CHECK-CHILD5: - key: modernize-use-using.IgnoreMacros -// CHECK-CHILD5-NEXT: value: 'false' +// CHECK-CHILD5-DAG: - key: modernize-loop-convert.MaxCopySize{{ *[[:space:]] *}}value: '21' +// CHECK-CHILD5-DAG: - key: modernize-loop-convert.MinConfidence{{ *[[:space:]] *}}value: reasonable +// CHECK-CHILD5-DAG: - key: modernize-use-using.IgnoreMacros{{ *[[:space:]] *}}value: 'false' // RUN: clang-tidy -dump-config \ // RUN: --config='{InheritParentConfig: false, \ // RUN: Checks: -llvm-qualified-auto}' \ // RUN: %S/Inputs/config-files/4/44/- -- | FileCheck %s -check-prefix=CHECK-CHILD6 -// CHECK-CHILD6: Checks: {{.*}}-llvm-qualified-auto +// CHECK-CHILD6: Checks: {{.*-llvm-qualified-auto'? 
*$}} // CHECK-CHILD6-NOT: - key: modernize-use-using.IgnoreMacros diff --git a/clang-tools-extra/unittests/CMakeLists.txt b/clang-tools-extra/unittests/CMakeLists.txt index 72abe0fa6d0c1..086a68e638307 100644 --- a/clang-tools-extra/unittests/CMakeLists.txt +++ b/clang-tools-extra/unittests/CMakeLists.txt @@ -5,10 +5,6 @@ function(add_extra_unittest test_dirname) add_unittest(ExtraToolsUnitTests ${test_dirname} ${ARGN}) endfunction() -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_subdirectory(clang-apply-replacements) add_subdirectory(clang-change-namespace) add_subdirectory(clang-doc) diff --git a/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp b/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp index 2a4ed64b88508..0894e5fd5eb91 100644 --- a/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/ClangTidyDiagnosticConsumerTest.cpp @@ -6,6 +6,7 @@ namespace clang { namespace tidy { namespace test { +namespace { class TestCheck : public ClangTidyCheck { public: TestCheck(StringRef Name, ClangTidyContext *Context) @@ -20,6 +21,7 @@ class TestCheck : public ClangTidyCheck { diag(Var->getTypeSpecStartLoc(), "type specifier"); } }; +} // namespace TEST(ClangTidyDiagnosticConsumer, SortsErrors) { std::vector Errors; diff --git a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp index 63f9a06e91bec..bfa594098fb71 100644 --- a/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/ClangTidyOptionsTest.cpp @@ -118,6 +118,7 @@ TEST(ParseConfiguration, MergeConfigurations) { EXPECT_TRUE(*Options.UseColor); } +namespace { class TestCheck : public ClangTidyCheck { public: TestCheck(ClangTidyContext *Context) : ClangTidyCheck("test", Context) {} @@ -140,6 +141,7 @@ class TestCheck : public ClangTidyCheck { return Options.getLocalOrGlobal(std::forward(Arguments)...); } }; +} // namespace #define CHECK_VAL(Value, Expected) \ do { \ @@ -222,8 +224,6 @@ TEST(CheckOptionsValidation, ValidIntOptions) { #undef CHECK_ERROR_INT } -// FIXME: Figure out why this test causes crashes on mac os. 
-#ifndef __APPLE__ TEST(ValidConfiguration, ValidEnumOptions) { ClangTidyOptions Options; @@ -275,7 +275,6 @@ TEST(ValidConfiguration, ValidEnumOptions) { #undef CHECK_ERROR_ENUM } -#endif #undef CHECK_VAL #undef CHECK_ERROR diff --git a/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp b/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp index ed5f02576f04a..e70d3fb91bf25 100644 --- a/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/IncludeInserterTest.cpp @@ -33,9 +33,7 @@ class IncludeInserterCheckBase : public ClangTidyCheck { void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) override { - Inserter = std::make_unique( - SM, getLangOpts(), utils::IncludeSorter::IS_Google); - PP->addPPCallbacks(Inserter->CreatePPCallbacks()); + Inserter.registerPreprocessor(PP); } void registerMatchers(ast_matchers::MatchFinder *Finder) override { @@ -46,15 +44,15 @@ class IncludeInserterCheckBase : public ClangTidyCheck { auto Diag = diag(Result.Nodes.getNodeAs("stmt")->getBeginLoc(), "foo, bar"); for (StringRef Header : HeadersToInclude()) { - Diag << Inserter->CreateIncludeInsertion( - Result.SourceManager->getMainFileID(), Header, IsAngledInclude()); + Diag << Inserter.createMainFileIncludeInsertion(Header, + IsAngledInclude()); } } virtual std::vector HeadersToInclude() const = 0; virtual bool IsAngledInclude() const = 0; - std::unique_ptr Inserter; + utils::IncludeInserter Inserter{utils::IncludeSorter::IS_Google}; }; class NonSystemHeaderInserterCheck : public IncludeInserterCheckBase { diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 5542250cb278a..90906ad789191 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -114,10 +114,6 @@ if( CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) option(CLANG_ENABLE_BOOTSTRAP "Generate the clang bootstrap target" OFF) option(LLVM_ENABLE_LIBXML2 "Use libxml2 if available." ON) - if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) - endif() - include(AddLLVM) include(TableGen) include(HandleLLVMOptions) @@ -125,6 +121,8 @@ if( CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) include(LLVMDistributionSupport) set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}") + set(BUG_REPORT_URL "${LLVM_PACKAGE_BUGREPORT}" CACHE STRING + "Default URL where bug reports are to be submitted.") if (NOT DEFINED LLVM_INCLUDE_TESTS) set(LLVM_INCLUDE_TESTS ON) @@ -261,7 +259,7 @@ set(DEFAULT_SYSROOT "" CACHE STRING set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id to ld") -set(ENABLE_X86_RELAX_RELOCATIONS OFF CACHE BOOL +set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "enable x86 relax relocations by default") set(ENABLE_EXPERIMENTAL_NEW_PASS_MANAGER FALSE CACHE BOOL @@ -760,6 +758,14 @@ if (CLANG_ENABLE_BOOTSTRAP) -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/${C_COMPILER} -DCMAKE_ASM_COMPILER_ID=Clang) + # cmake requires CMAKE_LINKER to be specified if the compiler is MSVC-like, + # otherwise it defaults the linker to be link.exe. 
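+  # When BOOTSTRAP_LLVM_ENABLE_LLD is set for a Windows host or target,
+  # point CMAKE_LINKER at the just-built lld-link.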
+ if(BOOTSTRAP_LLVM_ENABLE_LLD) + if((WIN32 AND NOT BOOTSTRAP_CMAKE_SYSTEM_NAME) OR BOOTSTRAP_CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(${CLANG_STAGE}_LINKER -DCMAKE_LINKER=${LLVM_RUNTIME_OUTPUT_INTDIR}/lld-link${CMAKE_EXECUTABLE_SUFFIX}) + endif() + endif() + if(BOOTSTRAP_CMAKE_SYSTEM_NAME) set(${CLANG_STAGE}_CONFIG -DLLVM_CONFIG_PATH=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-config) set(${CLANG_STAGE}_TABLEGEN diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 259684ff2b0d9..e00b64073ca52 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -233,14 +233,17 @@ set(LLVM_TOOLCHAIN_TOOLS llvm-ar llvm-cov llvm-cxxfilt + llvm-dlltool llvm-dwarfdump llvm-dwp llvm-gsymutil llvm-lib + llvm-mt llvm-nm llvm-objcopy llvm-objdump llvm-profdata + llvm-rc llvm-ranlib llvm-readelf llvm-readobj diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 1613c8e453184..8eb010eae2659 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -1016,7 +1016,7 @@ Flags controlling how ``#include``\s are resolved to files. .. option:: -I, --include-directory , --include-directory= -Add directory to include search path +Add directory to include search path. If there are multiple -I options, these directories are searched in the order they are given before the standard system directories are searched. If the same directory is in the SYSTEM include search paths, for example if also specified with -isystem, the -I option will be ignored .. option:: -I-, --include-barrier diff --git a/clang/docs/DataFlowSanitizer.rst b/clang/docs/DataFlowSanitizer.rst index 44956037490a8..cd8c5869a017b 100644 --- a/clang/docs/DataFlowSanitizer.rst +++ b/clang/docs/DataFlowSanitizer.rst @@ -174,6 +174,58 @@ the correct labels are propagated. return 0; } +fast16labels mode +================= + +If you need 16 or fewer labels, you can use fast16labels instrumentation for +less CPU and code size overhead. To use fast16labels instrumentation, you'll +need to specify `-fsanitize=dataflow -mllvm -dfsan-fast-16-labels` in your +compile and link commands and use a modified API for creating and managing +labels. + +In fast16labels mode, base labels are simply 16-bit unsigned integers that are +powers of 2 (i.e. 1, 2, 4, 8, ..., 32768), and union labels are created by ORing +base labels. In this mode DFSan does not manage any label metadata, so the +functions `dfsan_create_label`, `dfsan_union`, `dfsan_get_label_info`, +`dfsan_has_label`, `dfsan_has_label_with_desc`, `dfsan_get_label_count`, and +`dfsan_dump_labels` are unsupported. Instead of using them, the user should +maintain any necessary metadata about base labels themselves. + +For example: + +.. 
code-block:: c++ + + #include <sanitizer/dfsan_interface.h> + #include <assert.h> + + int main(void) { + int i = 100; + int j = 200; + int k = 300; + dfsan_label i_label = 1; + dfsan_label j_label = 2; + dfsan_label k_label = 4; + dfsan_set_label(i_label, &i, sizeof(i)); + dfsan_set_label(j_label, &j, sizeof(j)); + dfsan_set_label(k_label, &k, sizeof(k)); + + dfsan_label ij_label = dfsan_get_label(i + j); + + assert(ij_label & i_label); // ij_label has i_label + assert(ij_label & j_label); // ij_label has j_label + assert(!(ij_label & k_label)); // ij_label doesn't have k_label + assert(ij_label == 3); // Verifies all of the above + + dfsan_label ijk_label = dfsan_get_label(i + j + k); + + assert(ijk_label & i_label); // ijk_label has i_label + assert(ijk_label & j_label); // ijk_label has j_label + assert(ijk_label & k_label); // ijk_label has k_label + assert(ijk_label == 7); // Verifies all of the above + + return 0; + } + Current status ============== diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 26fbfab96bc8c..af5e538b1435a 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -264,9 +264,11 @@ want to help with the implementation. +==============================+==============================================================+==========================+=======================================================================+ | misc extension | user-defined function variants with #ifdef protection | :part:`worked on` | D71179 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| misc extension | default(firstprivate) & default(private) | :part:`worked on` | | +| misc extension | default(firstprivate) & default(private) | :part:`partial` | firstprivate done: D75591 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop extension | Loop tiling transformation | :part:`claimed` | | +| loop extension | Loop tiling transformation | :part:`worked on` | D76342 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | 'present' map type modifier | :part:`worked on` | D83061, D83062 | +| device extension | 'present' map type modifier | :part:`mostly done` | D83061, D83062, D84422 | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | 'present' motion modifier | :good:`done` | D84711, D84712 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9274081c4d62c..03eca8a26843e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -100,7 +100,10 @@ future versions of Clang. Modified Compiler Flags ----------------------- -- ... +- On ELF, ``-gz`` now defaults to ``-gz=zlib`` with the integrated assembler. + It produces ``SHF_COMPRESSED`` style compression of debug information.
GNU + binutils 2.26 or newer, or lld is required to link produced object files. Use + ``-gz=zlib-gnu`` to get the old behavior. New Pragmas in Clang -------------------- diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index f6720c5492a58..ed33b2d6394f9 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1464,8 +1464,16 @@ class ASTContext : public RefCountedBase { void getInjectedTemplateArgs(const TemplateParameterList *Params, SmallVectorImpl &Args); + /// Form a pack expansion type with the given pattern. + /// \param NumExpansions The number of expansions for the pack, if known. + /// \param ExpectPackInType If \c false, we should not expect \p Pattern to + /// contain an unexpanded pack. This only makes sense if the pack + /// expansion is used in a context where the arity is inferred from + /// elsewhere, such as if the pattern contains a placeholder type or + /// if this is the canonical type of another pack expansion type. QualType getPackExpansionType(QualType Pattern, - Optional NumExpansions); + Optional NumExpansions, + bool ExpectPackInType = true); QualType getObjCInterfaceType(const ObjCInterfaceDecl *Decl, ObjCInterfaceDecl *PrevDecl = nullptr) const; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 79645eddeb5cb..0cfc852016131 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -269,6 +269,19 @@ class NamedDecl : public Decl { /// Get the actual, stored name of the declaration, which may be a special /// name. + /// + /// Note that generally in diagnostics, the non-null \p NamedDecl* itself + /// should be sent into the diagnostic instead of using the result of + /// \p getDeclName(). + /// + /// A \p DeclarationName in a diagnostic will just be streamed to the output, + /// which will directly result in a call to \p DeclarationName::print. + /// + /// A \p NamedDecl* in a diagnostic will also ultimately result in a call to + /// \p DeclarationName::print, but with two customisation points along the + /// way (\p getNameForDiagnostic and \p printName). These are used to print + /// the template arguments if any, and to provide a user-friendly name for + /// some entities (such as unnamed variables and anonymous records). DeclarationName getDeclName() const { return Name; } /// Set the name of this declaration. diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 2b8d7e879a0aa..20f058b87e7f3 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1025,7 +1025,7 @@ class CXXRecordDecl : public RecordDecl { } /// Set the captures for this lambda closure type. 
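The ``getDeclName()`` guidance above is easiest to see side by side; a minimal sketch, where ``err_sample`` stands in for a real diagnostic ID:

.. code-block:: c++

  // Sketch: prefer streaming the NamedDecl* itself into a diagnostic.
  void diagnoseDecl(Sema &S, const NamedDecl *ND, SourceLocation Loc) {
    // Preferred: goes through getNameForDiagnostic/printName, so template
    // arguments and friendly names for unnamed entities are printed.
    S.Diag(Loc, diag::err_sample) << ND;

    // Discouraged: streams the raw DeclarationName, bypassing both
    // customisation points.
    S.Diag(Loc, diag::err_sample) << ND->getDeclName();
  }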
- void setCaptures(ArrayRef Captures); + void setCaptures(ASTContext &Context, ArrayRef Captures); /// For a closure type, retrieve the mapping from captured /// variables and \c this to the non-static data members that store the diff --git a/clang/include/clang/AST/ExprConcepts.h b/clang/include/clang/AST/ExprConcepts.h index 2a88ed5175d24..1544c498ef662 100644 --- a/clang/include/clang/AST/ExprConcepts.h +++ b/clang/include/clang/AST/ExprConcepts.h @@ -126,7 +126,11 @@ class ConceptSpecializationExpr final : public Expr, public ConceptReference, } SourceLocation getEndLoc() const LLVM_READONLY { - return ArgsAsWritten->RAngleLoc; + // If the ConceptSpecializationExpr is the ImmediatelyDeclaredConstraint + // of a TypeConstraint written syntactically as a constrained-parameter, + // there may not be a template argument list. + return ArgsAsWritten->RAngleLoc.isValid() ? ArgsAsWritten->RAngleLoc + : ConceptName.getEndLoc(); } // Iterators diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index c649502f765b4..5b588f4b5740d 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -6329,8 +6329,20 @@ class OMPToClause final : public OMPMappableExprListClause, friend OMPVarListClause; friend TrailingObjects; + /// Motion-modifiers for the 'to' clause. + OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + + /// Location of motion-modifiers for the 'to' clause. + SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; + + /// Colon location. + SourceLocation ColonLoc; + /// Build clause with number of variables \a NumVars. /// + /// \param TheMotionModifiers Motion-modifiers. + /// \param TheMotionModifiersLoc Locations of motion-modifiers. /// \param MapperQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperIdInfo The identifier of associated user-defined mapper. @@ -6342,13 +6354,24 @@ class OMPToClause final : public OMPMappableExprListClause, /// NumUniqueDeclarations: number of unique base declarations in this clause; /// 3) NumComponentLists: number of component lists in this clause; and 4) /// NumComponents: total number of expression components in the clause. - explicit OMPToClause(NestedNameSpecifierLoc MapperQualifierLoc, + explicit OMPToClause(ArrayRef TheMotionModifiers, + ArrayRef TheMotionModifiersLoc, + NestedNameSpecifierLoc MapperQualifierLoc, DeclarationNameInfo MapperIdInfo, const OMPVarListLocTy &Locs, const OMPMappableExprListSizeTy &Sizes) : OMPMappableExprListClause(llvm::omp::OMPC_to, Locs, Sizes, /*SupportsMapper=*/true, &MapperQualifierLoc, - &MapperIdInfo) {} + &MapperIdInfo) { + assert(llvm::array_lengthof(MotionModifiers) == TheMotionModifiers.size() && + "Unexpected number of motion modifiers."); + llvm::copy(TheMotionModifiers, std::begin(MotionModifiers)); + + assert(llvm::array_lengthof(MotionModifiersLoc) == + TheMotionModifiersLoc.size() && + "Unexpected number of motion modifier locations."); + llvm::copy(TheMotionModifiersLoc, std::begin(MotionModifiersLoc)); + } /// Build an empty clause. /// @@ -6361,6 +6384,29 @@ class OMPToClause final : public OMPMappableExprListClause, : OMPMappableExprListClause(llvm::omp::OMPC_to, OMPVarListLocTy(), Sizes, /*SupportsMapper=*/true) {} + /// Set motion-modifier for the clause. + /// + /// \param I index for motion-modifier. + /// \param T motion-modifier for the clause. 
+ void setMotionModifier(unsigned I, OpenMPMotionModifierKind T) { + assert(I < NumberOfOMPMotionModifiers && + "Unexpected index to store motion modifier, exceeds array size."); + MotionModifiers[I] = T; + } + + /// Set location for the motion-modifier. + /// + /// \param I index for motion-modifier location. + /// \param TLoc motion-modifier location. + void setMotionModifierLoc(unsigned I, SourceLocation TLoc) { + assert(I < NumberOfOMPMotionModifiers && + "Index to store motion modifier location exceeds array size."); + MotionModifiersLoc[I] = TLoc; + } + + /// Set colon location. + void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; } + /// Define the sizes of each trailing object array except the last one. This /// is required for TrailingObjects to work properly. size_t numTrailingObjects(OverloadToken) const { @@ -6385,6 +6431,8 @@ class OMPToClause final : public OMPMappableExprListClause, /// \param Vars The original expression used in the clause. /// \param Declarations Declarations used in the clause. /// \param ComponentLists Component lists used in the clause. + /// \param MotionModifiers Motion-modifiers. + /// \param MotionModifiersLoc Location of motion-modifiers. /// \param UDMapperRefs References to user-defined mappers associated with /// expressions used in the clause. /// \param UDMQualifierLoc C++ nested name specifier for the associated @@ -6395,6 +6443,8 @@ class OMPToClause final : public OMPMappableExprListClause, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); @@ -6409,6 +6459,38 @@ class OMPToClause final : public OMPMappableExprListClause, static OMPToClause *CreateEmpty(const ASTContext &C, const OMPMappableExprListSizeTy &Sizes); + /// Fetches the motion-modifier at 'Cnt' index of array of modifiers. + /// + /// \param Cnt index for motion-modifier. + OpenMPMotionModifierKind getMotionModifier(unsigned Cnt) const LLVM_READONLY { + assert(Cnt < NumberOfOMPMotionModifiers && + "Requested modifier exceeds the total number of modifiers."); + return MotionModifiers[Cnt]; + } + + /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' + /// locations. + /// + /// \param Cnt index for motion-modifier location. + SourceLocation getMotionModifierLoc(unsigned Cnt) const LLVM_READONLY { + assert(Cnt < NumberOfOMPMotionModifiers && + "Requested modifier location exceeds total number of modifiers."); + return MotionModifiersLoc[Cnt]; + } + + /// Fetches ArrayRef of motion-modifiers. + ArrayRef getMotionModifiers() const LLVM_READONLY { + return llvm::makeArrayRef(MotionModifiers); + } + + /// Fetches ArrayRef of location of motion-modifiers. + ArrayRef getMotionModifiersLoc() const LLVM_READONLY { + return llvm::makeArrayRef(MotionModifiersLoc); + } + + /// Get colon location. + SourceLocation getColonLoc() const { return ColonLoc; } + child_range children() { return child_range(reinterpret_cast(varlist_begin()), reinterpret_cast(varlist_end())); @@ -6449,8 +6531,20 @@ class OMPFromClause final friend OMPVarListClause; friend TrailingObjects; + /// Motion-modifiers for the 'from' clause. + OpenMPMotionModifierKind MotionModifiers[NumberOfOMPMotionModifiers] = { + OMPC_MOTION_MODIFIER_unknown, OMPC_MOTION_MODIFIER_unknown}; + + /// Location of motion-modifiers for the 'from' clause. + SourceLocation MotionModifiersLoc[NumberOfOMPMotionModifiers]; + + /// Colon location. 
+ SourceLocation ColonLoc; + /// Build clause with number of variables \a NumVars. /// + /// \param TheMotionModifiers Motion-modifiers. + /// \param TheMotionModifiersLoc Locations of motion-modifiers. /// \param MapperQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperIdInfo The identifier of associated user-defined mapper. @@ -6462,13 +6556,24 @@ class OMPFromClause final /// NumUniqueDeclarations: number of unique base declarations in this clause; /// 3) NumComponentLists: number of component lists in this clause; and 4) /// NumComponents: total number of expression components in the clause. - explicit OMPFromClause(NestedNameSpecifierLoc MapperQualifierLoc, + explicit OMPFromClause(ArrayRef TheMotionModifiers, + ArrayRef TheMotionModifiersLoc, + NestedNameSpecifierLoc MapperQualifierLoc, DeclarationNameInfo MapperIdInfo, const OMPVarListLocTy &Locs, const OMPMappableExprListSizeTy &Sizes) : OMPMappableExprListClause(llvm::omp::OMPC_from, Locs, Sizes, /*SupportsMapper=*/true, &MapperQualifierLoc, - &MapperIdInfo) {} + &MapperIdInfo) { + assert(llvm::array_lengthof(MotionModifiers) == TheMotionModifiers.size() && + "Unexpected number of motion modifiers."); + llvm::copy(TheMotionModifiers, std::begin(MotionModifiers)); + + assert(llvm::array_lengthof(MotionModifiersLoc) == + TheMotionModifiersLoc.size() && + "Unexpected number of motion modifier locations."); + llvm::copy(TheMotionModifiersLoc, std::begin(MotionModifiersLoc)); + } /// Build an empty clause. /// @@ -6481,6 +6586,29 @@ class OMPFromClause final : OMPMappableExprListClause(llvm::omp::OMPC_from, OMPVarListLocTy(), Sizes, /*SupportsMapper=*/true) {} + /// Set motion-modifier for the clause. + /// + /// \param I index for motion-modifier. + /// \param T motion-modifier for the clause. + void setMotionModifier(unsigned I, OpenMPMotionModifierKind T) { + assert(I < NumberOfOMPMotionModifiers && + "Unexpected index to store motion modifier, exceeds array size."); + MotionModifiers[I] = T; + } + + /// Set location for the motion-modifier. + /// + /// \param I index for motion-modifier location. + /// \param TLoc motion-modifier location. + void setMotionModifierLoc(unsigned I, SourceLocation TLoc) { + assert(I < NumberOfOMPMotionModifiers && + "Index to store motion modifier location exceeds array size."); + MotionModifiersLoc[I] = TLoc; + } + + /// Set colon location. + void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; } + /// Define the sizes of each trailing object array except the last one. This /// is required for TrailingObjects to work properly. size_t numTrailingObjects(OverloadToken) const { @@ -6505,18 +6633,21 @@ class OMPFromClause final /// \param Vars The original expression used in the clause. /// \param Declarations Declarations used in the clause. /// \param ComponentLists Component lists used in the clause. + /// \param MotionModifiers Motion-modifiers. + /// \param MotionModifiersLoc Location of motion-modifiers. /// \param UDMapperRefs References to user-defined mappers associated with /// expressions used in the clause. /// \param UDMQualifierLoc C++ nested name specifier for the associated /// user-defined mapper. /// \param MapperId The identifier of associated user-defined mapper. 
- static OMPFromClause *Create(const ASTContext &C, const OMPVarListLocTy &Locs, - ArrayRef Vars, - ArrayRef Declarations, - MappableExprComponentListsRef ComponentLists, - ArrayRef UDMapperRefs, - NestedNameSpecifierLoc UDMQualifierLoc, - DeclarationNameInfo MapperId); + static OMPFromClause * + Create(const ASTContext &C, const OMPVarListLocTy &Locs, + ArrayRef Vars, ArrayRef Declarations, + MappableExprComponentListsRef ComponentLists, + ArrayRef UDMapperRefs, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId); /// Creates an empty clause with the place for \a NumVars variables. /// @@ -6529,6 +6660,38 @@ class OMPFromClause final static OMPFromClause *CreateEmpty(const ASTContext &C, const OMPMappableExprListSizeTy &Sizes); + /// Fetches the motion-modifier at 'Cnt' index of array of modifiers. + /// + /// \param Cnt index for motion-modifier. + OpenMPMotionModifierKind getMotionModifier(unsigned Cnt) const LLVM_READONLY { + assert(Cnt < NumberOfOMPMotionModifiers && + "Requested modifier exceeds the total number of modifiers."); + return MotionModifiers[Cnt]; + } + + /// Fetches the motion-modifier location at 'Cnt' index of array of modifiers' + /// locations. + /// + /// \param Cnt index for motion-modifier location. + SourceLocation getMotionModifierLoc(unsigned Cnt) const LLVM_READONLY { + assert(Cnt < NumberOfOMPMotionModifiers && + "Requested modifier location exceeds total number of modifiers."); + return MotionModifiersLoc[Cnt]; + } + + /// Fetches ArrayRef of motion-modifiers. + ArrayRef getMotionModifiers() const LLVM_READONLY { + return llvm::makeArrayRef(MotionModifiers); + } + + /// Fetches ArrayRef of location of motion-modifiers. + ArrayRef getMotionModifiersLoc() const LLVM_READONLY { + return llvm::makeArrayRef(MotionModifiersLoc); + } + + /// Get colon location. + SourceLocation getColonLoc() const { return ColonLoc; } + child_range children() { return child_range(reinterpret_cast(varlist_begin()), reinterpret_cast(varlist_end())); @@ -7623,6 +7786,8 @@ class OMPClausePrinter final : public OMPClauseVisitor { /// Process clauses with list of variables. template void VisitOMPClauseList(T *Node, char StartSym); + /// Process motion clauses. + template void VisitOMPMotionClause(T *Node); public: OMPClausePrinter(raw_ostream &OS, const PrintingPolicy &Policy) @@ -7635,6 +7800,10 @@ class OMPClausePrinter final : public OMPClauseVisitor { struct OMPTraitProperty { llvm::omp::TraitProperty Kind = llvm::omp::TraitProperty::invalid; + + /// The raw string as we parsed it. This is needed for the `isa` trait set + /// (which accepts anything) and (later) extensions. + StringRef RawString; }; struct OMPTraitSelector { Expr *ScoreOrCondition = nullptr; @@ -7692,6 +7861,23 @@ class OMPTraitInfo { llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo &TI); llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo *TI); +/// Clang specific specialization of the OMPContext to lookup target features. 
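Both ``OMPToClause`` and ``OMPFromClause`` expose the same motion-modifier accessor surface shown above, so consumers can stay clause-agnostic; a small sketch with a hypothetical helper name:

.. code-block:: c++

  #include "clang/AST/OpenMPClause.h"

  // True if a 'to' clause carries the 'present' motion-modifier.
  bool hasPresentModifier(const clang::OMPToClause *C) {
    for (clang::OpenMPMotionModifierKind K : C->getMotionModifiers())
      if (K == clang::OMPC_MOTION_MODIFIER_present)
        return true;
    return false;
  }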
+struct TargetOMPContext final : public llvm::omp::OMPContext { + + TargetOMPContext(ASTContext &ASTCtx, + std::function<void(StringRef)> &&DiagUnknownTrait, + const FunctionDecl *CurrentFunctionDecl); + virtual ~TargetOMPContext() = default; + + /// See llvm::omp::OMPContext::matchesISATrait + bool matchesISATrait(StringRef RawString) const override; + +private: + std::function<bool(StringRef)> FeatureValidityCheck; + std::function<void(StringRef)> DiagUnknownTrait; + llvm::StringMap<bool> FeatureMap; +}; + } // namespace clang #endif // LLVM_CLANG_AST_OPENMPCLAUSE_H diff --git a/clang/include/clang/AST/RecordLayout.h b/clang/include/clang/AST/RecordLayout.h index b259791af509d..946fbd8f4ce24 100644 --- a/clang/include/clang/AST/RecordLayout.h +++ b/clang/include/clang/AST/RecordLayout.h @@ -70,6 +70,11 @@ class ASTRecordLayout { // Alignment - Alignment of record in characters. CharUnits Alignment; + // PreferredAlignment - Preferred alignment of record in characters. This + // can differ from Alignment in cases where it is beneficial for performance + // or for preserving backwards compatibility (e.g. the AIX ABI). + CharUnits PreferredAlignment; + // UnadjustedAlignment - Maximum of the alignments of the record members in // characters. CharUnits UnadjustedAlignment; @@ -91,6 +96,11 @@ class ASTRecordLayout { /// which is the alignment of the object without virtual bases. CharUnits NonVirtualAlignment; + /// PreferredNVAlignment - The preferred non-virtual alignment (in chars) of + /// an object, which is the preferred alignment of the object without + /// virtual bases. + CharUnits PreferredNVAlignment; + /// SizeOfLargestEmptySubobject - The size of the largest empty subobject /// (either a base or a member). Will be zero if the class doesn't contain /// any empty subobjects. @@ -139,30 +149,26 @@ class ASTRecordLayout { CXXRecordLayoutInfo *CXXInfo = nullptr; ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment, - CharUnits unadjustedAlignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, CharUnits requiredAlignment, CharUnits datasize, ArrayRef<uint64_t> fieldoffsets); using BaseOffsetsMapTy = CXXRecordLayoutInfo::BaseOffsetsMapTy; // Constructor for C++ records. - ASTRecordLayout(const ASTContext &Ctx, - CharUnits size, CharUnits alignment, - CharUnits unadjustedAlignment, - CharUnits requiredAlignment, - bool hasOwnVFPtr, bool hasExtendableVFPtr, - CharUnits vbptroffset, - CharUnits datasize, - ArrayRef<uint64_t> fieldoffsets, + ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, + CharUnits requiredAlignment, bool hasOwnVFPtr, + bool hasExtendableVFPtr, CharUnits vbptroffset, + CharUnits datasize, ArrayRef<uint64_t> fieldoffsets, CharUnits nonvirtualsize, CharUnits nonvirtualalignment, + CharUnits preferrednvalignment, CharUnits SizeOfLargestEmptySubobject, - const CXXRecordDecl *PrimaryBase, - bool IsPrimaryBaseVirtual, + const CXXRecordDecl *PrimaryBase, bool IsPrimaryBaseVirtual, const CXXRecordDecl *BaseSharingVBPtr, - bool EndsWithZeroSizedObject, - bool LeadsWithZeroSizedBase, - const BaseOffsetsMapTy& BaseOffsets, - const VBaseOffsetsMapTy& VBaseOffsets); + bool EndsWithZeroSizedObject, bool LeadsWithZeroSizedBase, + const BaseOffsetsMapTy &BaseOffsets, + const VBaseOffsetsMapTy &VBaseOffsets); ~ASTRecordLayout() = default; @@ -175,6 +181,10 @@ class ASTRecordLayout { /// getAlignment - Get the record alignment in characters.
CharUnits getAlignment() const { return Alignment; } + /// getPreferredAlignment - Get the record preferred alignment in + /// characters. + CharUnits getPreferredAlignment() const { return PreferredAlignment; } + /// getUnadjustedAlignment - Get the record alignment in characters, before /// alignment adjustment. CharUnits getUnadjustedAlignment() const { return UnadjustedAlignment; } @@ -193,9 +203,7 @@ class ASTRecordLayout { /// getDataSize() - Get the record data size, which is the record size /// without tail padding, in characters. - CharUnits getDataSize() const { - return DataSize; - } + CharUnits getDataSize() const { return DataSize; } /// getNonVirtualSize - Get the non-virtual size (in chars) of an object, /// which is the size of the object without virtual bases. @@ -205,14 +213,23 @@ class ASTRecordLayout { return CXXInfo->NonVirtualSize; } - /// getNonVirtualSize - Get the non-virtual alignment (in chars) of an object, - /// which is the alignment of the object without virtual bases. + /// getNonVirtualAlignment - Get the non-virtual alignment (in chars) of an + /// object, which is the alignment of the object without virtual bases. CharUnits getNonVirtualAlignment() const { assert(CXXInfo && "Record layout does not have C++ specific info!"); return CXXInfo->NonVirtualAlignment; } + /// getPreferredNVAlignment - Get the preferred non-virtual alignment (in + /// chars) of an object, which is the preferred alignment of the object + /// without virtual bases. + CharUnits getPreferredNVAlignment() const { + assert(CXXInfo && "Record layout does not have C++ specific info!"); + + return CXXInfo->PreferredNVAlignment; + } + /// getPrimaryBase - Get the primary base for this record. const CXXRecordDecl *getPrimaryBase() const { assert(CXXInfo && "Record layout does not have C++ specific info!"); @@ -287,9 +304,7 @@ class ASTRecordLayout { return !CXXInfo->VBPtrOffset.isNegative(); } - CharUnits getRequiredAlignment() const { - return RequiredAlignment; - } + CharUnits getRequiredAlignment() const { return RequiredAlignment; } bool endsWithZeroSizedObject() const { return CXXInfo && CXXInfo->EndsWithZeroSizedObject; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index bc9559776d4e6..4cf06441fb7b1 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -483,7 +483,6 @@ class Qualifiers { // We also define global_device and global_host address spaces, // to distinguish global pointers allocated on host from pointers // allocated on device, which are a subset of __global. - // FIXME: add a reference to spec when ready (A == LangAS::opencl_global && (B == LangAS::opencl_global_device || B == LangAS::opencl_global_host)) || // Consider pointer size address spaces to be equivalent to default. @@ -4406,11 +4405,7 @@ class TypedefType : public Type { protected: friend class ASTContext; // ASTContext creates these.
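A short sketch of how a layout consumer might read the new preferred-alignment fields next to the ABI alignment (the dump function is hypothetical):

.. code-block:: c++

  #include "clang/AST/ASTContext.h"
  #include "clang/AST/RecordLayout.h"
  #include "llvm/Support/raw_ostream.h"

  void dumpAlignments(const clang::ASTContext &Ctx,
                      const clang::RecordDecl *RD) {
    const clang::ASTRecordLayout &L = Ctx.getASTRecordLayout(RD);
    // On most targets the two values agree; AIX's power alignment rules are
    // the motivating case where they diverge.
    llvm::errs() << "ABI alignment: " << L.getAlignment().getQuantity()
                 << ", preferred: " << L.getPreferredAlignment().getQuantity()
                 << " (chars)\n";
  }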
- TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can) - : Type(tc, can, can->getDependence() & ~TypeDependence::UnexpandedPack), - Decl(const_cast(D)) { - assert(!isa(can) && "Invalid canonical type"); - } + TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can); public: TypedefNameDecl *getDecl() const { return Decl; } @@ -5647,7 +5642,8 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { PackExpansionType(QualType Pattern, QualType Canon, Optional NumExpansions) : Type(PackExpansion, Canon, - (Pattern->getDependence() | TypeDependence::Instantiation) & + (Pattern->getDependence() | TypeDependence::Dependent | + TypeDependence::Instantiation) & ~TypeDependence::UnexpandedPack), Pattern(Pattern) { PackExpansionTypeBits.NumExpansions = @@ -5668,8 +5664,8 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { return None; } - bool isSugared() const { return !Pattern->isDependentType(); } - QualType desugar() const { return isSugared() ? Pattern : QualType(this, 0); } + bool isSugared() const { return false; } + QualType desugar() const { return QualType(this, 0); } void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, getPattern(), getNumExpansions()); diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 4b04bfa0194ca..80b66e7702441 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -722,7 +722,8 @@ let Class = PackExpansionType in { } def : Creator<[{ - return ctx.getPackExpansionType(pattern, numExpansions); + return ctx.getPackExpansionType(pattern, numExpansions, + /*ExpectPackInType*/false); }]>; } diff --git a/clang/include/clang/Analysis/Analyses/Dominators.h b/clang/include/clang/Analysis/Analyses/Dominators.h index 95a661138df4d..25a5ba9d83fe3 100644 --- a/clang/include/clang/Analysis/Analyses/Dominators.h +++ b/clang/include/clang/Analysis/Analyses/Dominators.h @@ -273,76 +273,6 @@ class ControlDependencyCalculator : public ManagedAnalysis { namespace llvm { -/// Clang's CFG contains nullpointers for unreachable succesors, e.g. when an -/// if statement's condition is always false, it's 'then' branch is represented -/// with a nullptr. This however will result in a nullpointer derefernece for -/// dominator tree calculation. -/// -/// To circumvent this, let's just crudely specialize the children getters -/// used in LLVM's dominator tree builder. 
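With ``PackExpansionType`` now always dependent and never sugar, ``desugar()`` is the identity; a minimal illustration of the new invariants:

.. code-block:: c++

  #include "clang/AST/Type.h"
  #include <cassert>

  void inspect(const clang::PackExpansionType *T) {
    // After this change a pack expansion never desugars to its pattern,
    // even when the pattern itself is non-dependent.
    assert(!T->isSugared());
    assert(T->desugar().getTypePtr() == T);
  }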
-namespace DomTreeBuilder { - -using ClangCFGDomChildrenGetter = -SemiNCAInfo::ChildrenGetter< - /*Inverse=*/false>; - -template <> -template <> -inline ClangCFGDomChildrenGetter::ResultTy ClangCFGDomChildrenGetter::Get( - clang::CFGBlock *N, std::integral_constant) { - auto RChildren = reverse(children(N)); - ResultTy Ret(RChildren.begin(), RChildren.end()); - Ret.erase(std::remove(Ret.begin(), Ret.end(), nullptr), Ret.end()); - return Ret; -} - -using ClangCFGDomReverseChildrenGetter = -SemiNCAInfo::ChildrenGetter< - /*Inverse=*/true>; - -template <> -template <> -inline ClangCFGDomReverseChildrenGetter::ResultTy -ClangCFGDomReverseChildrenGetter::Get( - clang::CFGBlock *N, std::integral_constant) { - auto IChildren = inverse_children(N); - ResultTy Ret(IChildren.begin(), IChildren.end()); - Ret.erase(std::remove(Ret.begin(), Ret.end(), nullptr), Ret.end()); - return Ret; -} - -using ClangCFGPostDomChildrenGetter = -SemiNCAInfo::ChildrenGetter< - /*Inverse=*/false>; - -template <> -template <> -inline ClangCFGPostDomChildrenGetter::ResultTy -ClangCFGPostDomChildrenGetter::Get( - clang::CFGBlock *N, std::integral_constant) { - auto RChildren = reverse(children(N)); - ResultTy Ret(RChildren.begin(), RChildren.end()); - Ret.erase(std::remove(Ret.begin(), Ret.end(), nullptr), Ret.end()); - return Ret; -} - -using ClangCFGPostDomReverseChildrenGetter = -SemiNCAInfo::ChildrenGetter< - /*Inverse=*/true>; - -template <> -template <> -inline ClangCFGPostDomReverseChildrenGetter::ResultTy -ClangCFGPostDomReverseChildrenGetter::Get( - clang::CFGBlock *N, std::integral_constant) { - auto IChildren = inverse_children(N); - ResultTy Ret(IChildren.begin(), IChildren.end()); - Ret.erase(std::remove(Ret.begin(), Ret.end(), nullptr), Ret.end()); - return Ret; -} - -} // end of namespace DomTreeBuilder - //===------------------------------------- /// DominatorTree GraphTraits specialization so the DominatorTree can be /// iterable by generic graph iterators. diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 73dec5e69c61a..cb6b406ec41ce 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1326,12 +1326,12 @@ def OpenCLGlobalAddressSpace : TypeAttr { def OpenCLGlobalDeviceAddressSpace : TypeAttr { let Spellings = [Clang<"opencl_global_device">]; - let Documentation = [OpenCLGlobalAddressSpacesDocs]; + let Documentation = [OpenCLAddressSpaceGlobalExtDocs]; } def OpenCLGlobalHostAddressSpace : TypeAttr { let Spellings = [Clang<"opencl_global_host">]; - let Documentation = [OpenCLGlobalAddressSpacesDocs]; + let Documentation = [OpenCLAddressSpaceGlobalExtDocs]; } def OpenCLLocalAddressSpace : TypeAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 467b96793263f..a207af5933731 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3600,14 +3600,27 @@ scope) variables and static local variable as well. }]; } -def OpenCLGlobalAddressSpacesDocs : Documentation { +def OpenCLAddressSpaceGlobalExtDocs : Documentation { let Category = DocOpenCLAddressSpaces; let Heading = "[[clang::opencl_global_device]], [[clang::opencl_global_host]]"; let Content = [{ -The (global_device) and (global_host) address space attributes specify that an -object is allocated in global memory on the device/host. It helps distinguishing -USM pointers that access device memory and accessors that access global memory -from those that access host memory. 
+The ``global_device`` and ``global_host`` address space attributes specify that +an object is allocated in global memory on the device/host. They help to +distinguish USM (Unified Shared Memory) pointers that access global device +memory from those that access global host memory. These new address spaces are +a subset of the ``__global/opencl_global`` address space; the full address-space +set model for OpenCL 2.0 with the extension looks as follows:
+  generic->global->host
+                 ->device
+         ->private
+         ->local
+  constant
+ +As ``global_device`` and ``global_host`` are a subset of the +``__global/opencl_global`` address space, it is allowed to convert +``global_device`` and ``global_host`` address spaces to +``__global/opencl_global`` address spaces (following ISO/IEC TR 18037 5.1.3 +"Address space nesting and rules for pointers"). }]; } diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index ecee7782920f6..39f29740cf56d 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -169,14 +169,8 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i8x16_i16x8, "V16cV8sV8s", "nc", "simd128 TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_s_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_s_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_u_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_u_i16x8_i8x16, "V8sV16c", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_s_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_s_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_u_i32x4_i16x8, "V4iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_u_i32x4_i16x8, "V4iV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4ii*", "nU", "simd128") +TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLi*", "nU", "simd128") #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 389a0eede5c85..44de7b6dfd703 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1278,6 +1278,11 @@ def warn_omp_declare_variant_string_literal_or_identifier "%select{set|selector|property}0; " "%select{set|selector|property}0 skipped">, InGroup; +def warn_unknown_begin_declare_variant_isa_trait + : Warning<"isa trait '%0' is not known to the current target; verify the " + "spelling or consider restricting the context selector with the " + "'arch' selector further">, + InGroup; def note_omp_declare_variant_ctx_options : Note<"context %select{set|selector|property}0 options are: %1">; def warn_omp_declare_variant_expected diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 62926c9edaf16..ab5b3a73efd33 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -824,6 +824,7 @@ def ext_main_used : Extension< /// parser diagnostics def ext_no_declarators : ExtWarn<"declaration does not declare anything">, InGroup; +def err_no_declarators : Error<"declaration does not declare anything">; def
ext_typedef_without_a_name : ExtWarn<"typedef requires a name">, InGroup; def err_typedef_not_identifier : Error<"typedef name must be an identifier">; @@ -9767,8 +9768,6 @@ def err_opencl_block_ref_block : Error< "cannot refer to a block inside block">; // OpenCL v2.0 s6.13.9 - Address space qualifier functions. -def err_opencl_builtin_to_addr_arg_num : Error< - "invalid number of arguments to function: %0">; def err_opencl_builtin_to_addr_invalid_arg : Error< "invalid argument %0 to function: %1, expecting a generic pointer argument">; @@ -9850,6 +9849,8 @@ def err_omp_multiple_array_items_in_map_clause : Error< "multiple array elements associated with the same variable are not allowed in map clauses of the same construct">; def err_omp_duplicate_map_type_modifier : Error< "same map type modifier has been specified more than once">; +def err_omp_duplicate_motion_modifier : Error< + "same motion modifier has been specified more than once">; def err_omp_pointer_mapped_along_with_derived_section : Error< "pointer cannot be mapped along with a section derived from itself">; def err_omp_original_storage_is_shared_and_does_not_contain : Error< @@ -10365,6 +10366,11 @@ def warn_nested_declare_variant : Warning<"nesting `omp begin/end declare variant` is not supported yet; " "nested context ignored">, InGroup; +def warn_unknown_declare_variant_isa_trait + : Warning<"isa trait '%0' is not known to the current target; verify the " + "spelling or consider restricting the context selector with the " + "'arch' selector further">, + InGroup; def err_omp_non_pointer_type_array_shaping_base : Error< "expected expression with a pointer to a complete type as a base of an array " "shaping operation">; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 275c4fcdabb25..59bb331a7a8a6 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -29,11 +29,8 @@ #ifndef OPENMP_MAP_MODIFIER_KIND #define OPENMP_MAP_MODIFIER_KIND(Name) #endif -#ifndef OPENMP_TO_MODIFIER_KIND -#define OPENMP_TO_MODIFIER_KIND(Name) -#endif -#ifndef OPENMP_FROM_MODIFIER_KIND -#define OPENMP_FROM_MODIFIER_KIND(Name) +#ifndef OPENMP_MOTION_MODIFIER_KIND +#define OPENMP_MOTION_MODIFIER_KIND(Name) #endif #ifndef OPENMP_DIST_SCHEDULE_KIND #define OPENMP_DIST_SCHEDULE_KIND(Name) @@ -126,11 +123,9 @@ OPENMP_MAP_MODIFIER_KIND(close) OPENMP_MAP_MODIFIER_KIND(mapper) OPENMP_MAP_MODIFIER_KIND(present) -// Modifiers for 'to' clause. -OPENMP_TO_MODIFIER_KIND(mapper) - -// Modifiers for 'from' clause. -OPENMP_FROM_MODIFIER_KIND(mapper) +// Modifiers for 'to' or 'from' clause. +OPENMP_MOTION_MODIFIER_KIND(mapper) +OPENMP_MOTION_MODIFIER_KIND(present) // Static attributes for 'dist_schedule' clause. 
OPENMP_DIST_SCHEDULE_KIND(static) @@ -163,8 +158,7 @@ OPENMP_REDUCTION_MODIFIER(task) #undef OPENMP_ATOMIC_DEFAULT_MEM_ORDER_KIND #undef OPENMP_MAP_KIND #undef OPENMP_MAP_MODIFIER_KIND -#undef OPENMP_TO_MODIFIER_KIND -#undef OPENMP_FROM_MODIFIER_KIND +#undef OPENMP_MOTION_MODIFIER_KIND #undef OPENMP_DIST_SCHEDULE_KIND #undef OPENMP_DEFAULTMAP_KIND #undef OPENMP_DEFAULTMAP_MODIFIER diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index dc6198f93f9d8..0f37dc9ad997d 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -86,21 +86,17 @@ enum OpenMPMapModifierKind { static constexpr unsigned NumberOfOMPMapClauseModifiers = OMPC_MAP_MODIFIER_last - OMPC_MAP_MODIFIER_unknown - 1; -/// OpenMP modifier kind for 'to' clause. -enum OpenMPToModifierKind { -#define OPENMP_TO_MODIFIER_KIND(Name) \ - OMPC_TO_MODIFIER_##Name, +/// OpenMP modifier kind for 'to' or 'from' clause. +enum OpenMPMotionModifierKind { +#define OPENMP_MOTION_MODIFIER_KIND(Name) \ + OMPC_MOTION_MODIFIER_##Name, #include "clang/Basic/OpenMPKinds.def" - OMPC_TO_MODIFIER_unknown + OMPC_MOTION_MODIFIER_unknown }; -/// OpenMP modifier kind for 'from' clause. -enum OpenMPFromModifierKind { -#define OPENMP_FROM_MODIFIER_KIND(Name) \ - OMPC_FROM_MODIFIER_##Name, -#include "clang/Basic/OpenMPKinds.def" - OMPC_FROM_MODIFIER_unknown -}; +/// Number of allowed motion-modifiers. +static constexpr unsigned NumberOfOMPMotionModifiers = + OMPC_MOTION_MODIFIER_unknown; /// OpenMP attributes for 'dist_schedule' clause. enum OpenMPDistScheduleClauseKind { diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 2ee3b16596302..004990ee31528 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1401,6 +1401,9 @@ class TargetInfo : public virtual TransferrableTargetInfo, /// Whether target allows to overalign ABI-specified preferred alignment virtual bool allowsLargerPreferedTypeAlignment() const { return true; } + /// Whether target defaults to the `power` alignment rules of AIX. + virtual bool defaultsToAIXPowerAlignment() const { return false; } + /// Set supported OpenCL extensions and optional core features. 
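Because the X-macro expands every motion modifier into the enum ahead of the ``unknown`` sentinel, the sentinel's value doubles as the modifier count; a one-line sanity check under that assumption:

.. code-block:: c++

  #include "clang/Basic/OpenMPKinds.h"

  // 'mapper' and 'present' are the only motion modifiers in this revision.
  static_assert(clang::NumberOfOMPMotionModifiers == 2,
                "motion-modifier count tracks the OPENMP_MOTION_MODIFIER_KIND "
                "entries in OpenMPKinds.def");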
virtual void setSupportedOpenCLOpts() {} diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index b50fa890e1f50..8634994bbfe68 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -758,10 +758,10 @@ ANNOTATION(non_type_undeclared) // annotation for an undeclared identifier that // was assumed to be an ADL-only function name ANNOTATION(non_type_dependent) // annotation for an assumed non-type member of // a dependent base class -ANNOTATION(primary_expr) // annotation for a primary expression -ANNOTATION( - uneval_primary_expr) // annotation for a primary expression which should be - // transformed to potentially evaluated +ANNOTATION(overload_set) // annotation for an unresolved overload set +ANNOTATION(primary_expr) // annotation for a primary expression, used when + // tentatively parsing a lambda init-capture or ObjC + // message send ANNOTATION(decltype) // annotation for a decltype expression, // e.g., "decltype(foo.bar())" diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index a4e3002b90753..011394c3ef455 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -100,7 +100,7 @@ def DeducedTemplateSpecializationType : TypeNode; def InjectedClassNameType : TypeNode, AlwaysDependent, LeafType; def DependentNameType : TypeNode, AlwaysDependent; def DependentTemplateSpecializationType : TypeNode, AlwaysDependent; -def PackExpansionType : TypeNode, NeverCanonicalUnlessDependent; +def PackExpansionType : TypeNode, AlwaysDependent; def ObjCTypeParamType : TypeNode, NeverCanonical; def ObjCObjectType : TypeNode; def ObjCInterfaceType : TypeNode, LeafType; diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Driver/Options.h index 7c5cddd9e8960..9831efda4e580 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Driver/Options.h @@ -33,7 +33,8 @@ enum ClangFlags { CC1Option = (1 << 10), CC1AsOption = (1 << 11), NoDriverOption = (1 << 12), - Ignored = (1 << 13) + LinkOption = (1 << 13), + Ignored = (1 << 14), }; enum ID { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4bbea6b0cf19c..bb0caccb0553d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -52,6 +52,10 @@ def CC1AsOption : OptionFlag; // NoDriverOption - This option should not be accepted by the driver. def NoDriverOption : OptionFlag; +// If an option affects linking, but has a primary group (so Link_Group cannot +// be used), add this flag. +def LinkOption : OptionFlag; + // A short name to show in documentation. The name will be interpreted as rST. class DocName { string DocName = name; } @@ -393,7 +397,12 @@ def I_ : Flag<["-"], "I-">, Group, "remove current directory from include path">; def I : JoinedOrSeparate<["-"], "I">, Group, Flags<[CC1Option,CC1AsOption]>, MetaVarName<"">, - HelpText<"Add directory to include search path">; + HelpText<"Add directory to include search path. If there are multiple -I " + "options, these directories are searched in the order they are " + "given before the standard system directories are searched. 
" + "If the same directory is in the SYSTEM include search paths, for " + "example if also specified with -isystem, the -I option will be " + "ignored">; def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group, MetaVarName<"">, HelpText<"Add directory to library search path">; def MD : Flag<["-"], "MD">, Group, @@ -592,7 +601,7 @@ def config_system_dir_EQ : Joined<["--"], "config-system-dir=">, Flags<[DriverOp HelpText<"System directory for configuration files">; def config_user_dir_EQ : Joined<["--"], "config-user-dir=">, Flags<[DriverOption, HelpHidden]>, HelpText<"User directory for configuration files">; -def coverage : Flag<["-", "--"], "coverage">, Flags<[CoreOption]>; +def coverage : Flag<["-", "--"], "coverage">, Group, Flags<[CoreOption]>; def cpp_precomp : Flag<["-"], "cpp-precomp">, Group; def current__version : JoinedOrSeparate<["-"], "current_version">; def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group, @@ -687,6 +696,9 @@ def dependency_dot : Separate<["-"], "dependency-dot">, Flags<[CC1Option]>, HelpText<"Filename to write DOT-formatted header dependencies to">; def module_dependency_dir : Separate<["-"], "module-dependency-dir">, Flags<[CC1Option]>, HelpText<"Directory to dump module dependencies to">; +def dsym_dir : JoinedOrSeparate<["-"], "dsym-dir">, + Flags<[DriverOption, RenderAsInput]>, + HelpText<"Directory to output dSYM's (if any) to">, MetaVarName<"">; def dumpmachine : Flag<["-"], "dumpmachine">; def dumpspecs : Flag<["-"], "dumpspecs">, Flags<[Unsupported]>; def dumpversion : Flag<["-"], "dumpversion">; @@ -708,7 +720,7 @@ def emit_merged_ifs : Flag<["-"], "emit-merged-ifs">, HelpText<"Generate Interface Stub Files, emit merged text not binary.">; def interface_stub_version_EQ : JoinedOrSeparate<["-"], "interface-stub-version=">, Flags<[CC1Option]>; def exported__symbols__list : Separate<["-"], "exported_symbols_list">; -def e : JoinedOrSeparate<["-"], "e">, Group; +def e : JoinedOrSeparate<["-"], "e">, Flags<[LinkerInput]>, Group; def fmax_tokens_EQ : Joined<["-"], "fmax-tokens=">, Group, Flags<[CC1Option]>, HelpText<"Max total number of preprocessed tokens for -Wmax-tokens.">; def fPIC : Flag<["-"], "fPIC">, Group; @@ -1259,7 +1271,7 @@ def finline_functions : Flag<["-"], "finline-functions">, Group, def finline_hint_functions: Flag<["-"], "finline-hint-functions">, Group, Flags<[CC1Option]>, HelpText<"Inline functions which are (explicitly or implicitly) marked inline">; def finline : Flag<["-"], "finline">, Group; -def fglobal_isel : Flag<["-"], "fglobal-isel">, Group, +def fglobal_isel : Flag<["-"], "fglobal-isel">, Group, HelpText<"Enables the global instruction selector">; def fexperimental_isel : Flag<["-"], "fexperimental-isel">, Group, Alias; @@ -1768,7 +1780,7 @@ def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">, HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">; defm preserve_as_comments : OptOutFFlag<"preserve-as-comments", "", "Do not preserve comments in inline assembly">; -def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group; +def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group, Flags<[LinkOption]>; def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group; def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>; def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group; @@ -2509,6 +2521,10 @@ def mdirect_move : Flag<["-"], "mdirect-move">, Group; def mnodirect_move : Flag<["-"], "mno-direct-move">, Group; +def mpaired_vector_memops: Flag<["-"], 
"mpaired-vector-memops">, + Group; +def mnopaired_vector_memops: Flag<["-"], "mno-paired-vector-memops">, + Group; def mhtm : Flag<["-"], "mhtm">, Group; def mno_htm : Flag<["-"], "mno-htm">, Group; def mfprnd : Flag<["-"], "mfprnd">, Group; @@ -2784,7 +2800,7 @@ def nostdinc : Flag<["-"], "nostdinc">, Flags<[CoreOption]>; def nostdlibinc : Flag<["-"], "nostdlibinc">; def nostdincxx : Flag<["-"], "nostdinc++">, Flags<[CC1Option]>, HelpText<"Disable standard #include directories for the C++ standard library">; -def nostdlib : Flag<["-"], "nostdlib">; +def nostdlib : Flag<["-"], "nostdlib">, Group; def nostdlibxx : Flag<["-"], "nostdlib++">; def nolibsycl : Flag<["-"], "nolibsycl">, Flags<[DriverOption, CoreOption]>, HelpText<"Do not link SYCL runtime library">; @@ -2830,15 +2846,15 @@ def pthread : Flag<["-"], "pthread">, Flags<[CC1Option]>, HelpText<"Support POSIX threads in generated code">; def no_pthread : Flag<["-"], "no-pthread">, Flags<[CC1Option]>; def p : Flag<["-"], "p">; -def pie : Flag<["-"], "pie">; -def static_pie : Flag<["-"], "static-pie">; +def pie : Flag<["-"], "pie">, Group; +def static_pie : Flag<["-"], "static-pie">, Group; def read__only__relocs : Separate<["-"], "read_only_relocs">; def remap : Flag<["-"], "remap">; def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[DriverOption,CC1Option]>, HelpText<"Rewrite Objective-C source to C++">, Group; def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[DriverOption]>, HelpText<"Rewrite Legacy Objective-C source to C++">; -def rdynamic : Flag<["-"], "rdynamic">; +def rdynamic : Flag<["-"], "rdynamic">, Group; def resource_dir : Separate<["-"], "resource-dir">, Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>, HelpText<"The directory which holds the compiler resource files">; @@ -2880,13 +2896,13 @@ def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">; def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">; def segs__read__ : Joined<["-"], "segs_read_">; def shared_libgcc : Flag<["-"], "shared-libgcc">; -def shared : Flag<["-", "--"], "shared">; +def shared : Flag<["-", "--"], "shared">, Group; def single__module : Flag<["-"], "single_module">; def specs_EQ : Joined<["-", "--"], "specs=">; def specs : Separate<["-", "--"], "specs">, Flags<[Unsupported]>; def static_libgcc : Flag<["-"], "static-libgcc">; def static_libstdcxx : Flag<["-"], "static-libstdc++">; -def static : Flag<["-", "--"], "static">, Flags<[NoArgumentUnused]>; +def static : Flag<["-", "--"], "static">, Group, Flags<[NoArgumentUnused]>; def std_default_EQ : Joined<["-"], "std-default=">; def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>, Group, HelpText<"Language standard to compile for">, @@ -3345,8 +3361,8 @@ defm : BooleanFFlag<"keep-inline-functions">, Group, Group; def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group; diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 5cd017fa925fd..b0dd363555ab1 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -419,6 +419,9 @@ class Preprocessor { /// The number of (LexLevel 0) preprocessor tokens. unsigned TokenCount = 0; + /// Preprocess every token regardless of LexLevel. + bool PreprocessToken = false; + /// The maximum number of (LexLevel 0) tokens before issuing a -Wmax-tokens /// warning, or zero for unlimited. 
unsigned MaxTokens = 0; @@ -1038,6 +1041,8 @@ class Preprocessor { OnToken = std::move(F); } + void setPreprocessToken(bool Preprocess) { PreprocessToken = Preprocess; } + bool isMacroDefined(StringRef Id) { return isMacroDefined(&Identifiers.get(Id)); } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 1b091dc083f16..2e79c5023c270 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3229,6 +3229,9 @@ class Parser : public CodeCompletionHandler { MapTypeModifiers; SmallVector MapTypeModifiersLoc; + SmallVector + MotionModifiers; + SmallVector MotionModifiersLoc; bool IsMapTypeImplicit = false; SourceLocation ExtraModifierLoc; }; diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 8db03babfb1e9..93a9126096557 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -2435,10 +2435,21 @@ class Declarator { return true; return false; } + /// Get the trailing return type appearing (at any level) within this + /// declarator. + ParsedType getTrailingReturnType() const { + for (const auto &Chunk : type_objects()) + if (Chunk.Kind == DeclaratorChunk::Function && + Chunk.Fun.hasTrailingReturnType()) + return Chunk.Fun.getTrailingReturnType(); + return ParsedType(); + } /// \brief Sets a trailing requires clause for this declarator. void setTrailingRequiresClause(Expr *TRC) { TrailingRequiresClause = TRC; + + SetRangeEnd(TRC->getEndLoc()); } /// \brief Sets a trailing requires clause for this declarator. diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e8b8f2859cc6b..40ad35cc84576 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2271,9 +2271,11 @@ class Sema final { /// resolved. ActOnNameClassifiedAsDependentNonType should be called to /// convert the result to an expression. NC_DependentNonType, - /// The name was classified as a non-type, and an expression representing - /// that name has been formed. - NC_ContextIndependentExpr, + /// The name was classified as an overload set, and an expression + /// representing that overload set has been formed. + /// ActOnNameClassifiedAsOverloadSet should be called to form a suitable + /// expression referencing the overload set. + NC_OverloadSet, /// The name was classified as a template whose specializations are types. NC_TypeTemplate, /// The name was classified as a variable template name. @@ -2310,8 +2312,8 @@ class Sema final { return NameClassification(NC_Unknown); } - static NameClassification ContextIndependentExpr(ExprResult E) { - NameClassification Result(NC_ContextIndependentExpr); + static NameClassification OverloadSet(ExprResult E) { + NameClassification Result(NC_OverloadSet); Result.Expr = E; return Result; } @@ -2363,7 +2365,7 @@ class Sema final { NameClassificationKind getKind() const { return Kind; } ExprResult getExpression() const { - assert(Kind == NC_ContextIndependentExpr); + assert(Kind == NC_OverloadSet); return Expr; } @@ -2443,6 +2445,8 @@ class Sema final { NamedDecl *Found, SourceLocation NameLoc, const Token &NextToken); + /// Act on the result of classifying a name as an overload set. + ExprResult ActOnNameClassifiedAsOverloadSet(Scope *S, Expr *OverloadSet); /// Describes the detailed kind of a template name. Used in diagnostics. 
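The new ``Declarator::getTrailingReturnType`` returns a null ``ParsedType`` when no chunk carries a trailing return type, so a caller can use it as a presence test; a minimal sketch:

.. code-block:: c++

  #include "clang/Sema/DeclSpec.h"

  bool hasTrailingReturnType(const clang::Declarator &D) {
    return !D.getTrailingReturnType().get().isNull();
  }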
enum class TemplateNameKindForDiagnostics { @@ -5001,11 +5005,10 @@ class Sema final { Expr *baseObjectExpr = nullptr, SourceLocation opLoc = SourceLocation()); - ExprResult BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs, - const Scope *S); + ExprResult BuildPossibleImplicitMemberExpr( + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs, const Scope *S, + UnresolvedLookupExpr *AsULE = nullptr); ExprResult BuildImplicitMemberExpr(const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, @@ -8331,6 +8334,8 @@ class Sema final { /// Completely replace the \c auto in \p TypeWithAuto by /// \p Replacement. This does not retain any \c auto type sugar. QualType ReplaceAutoType(QualType TypeWithAuto, QualType Replacement); + TypeSourceInfo *ReplaceAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto, + QualType Replacement); /// Result type of DeduceAutoType. enum DeduceAutoResult { @@ -10896,7 +10901,9 @@ class Sema final { DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, bool IsMapTypeImplicit, - SourceLocation ExtraModifierLoc); + SourceLocation ExtraModifierLoc, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc); /// Called on well-formed 'inclusive' clause. OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, SourceLocation StartLoc, @@ -11033,15 +11040,20 @@ class Sema final { SourceLocation KindLoc, SourceLocation EndLoc); /// Called on well-formed 'to' clause. OMPClause * - ActOnOpenMPToClause(ArrayRef VarList, CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - const OMPVarListLocTy &Locs, + ActOnOpenMPToClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers = llvm::None); /// Called on well-formed 'from' clause. - OMPClause *ActOnOpenMPFromClause( - ArrayRef VarList, CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = llvm::None); + OMPClause * + ActOnOpenMPFromClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers = llvm::None); /// Called on well-formed 'use_device_ptr' clause. 
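A sketch of the widened ``ActOnOpenMPToClause`` entry point from the caller's side; the modifier values and the empty locations are illustrative:

.. code-block:: c++

  #include "clang/Sema/Sema.h"

  clang::OMPClause *buildPresentToClause(clang::Sema &S,
                                         llvm::ArrayRef<clang::Expr *> Vars,
                                         const clang::OMPVarListLocTy &Locs) {
    clang::OpenMPMotionModifierKind Mods[] = {
        clang::OMPC_MOTION_MODIFIER_present,
        clang::OMPC_MOTION_MODIFIER_unknown};
    clang::SourceLocation ModLocs[2]; // invalid locations, for brevity
    clang::CXXScopeSpec MapperScope;  // no user-defined mapper
    clang::DeclarationNameInfo MapperId;
    return S.ActOnOpenMPToClause(Mods, ModLocs, MapperScope, MapperId,
                                 /*ColonLoc=*/clang::SourceLocation(), Vars,
                                 Locs);
  }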
OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, const OMPVarListLocTy &Locs); diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h index 1abe297820886..a561ac67bf786 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SVals.h @@ -80,7 +80,7 @@ class SVal { #define ABSTRACT_SVAL_WITH_KIND(Id, Parent) Id ## Kind, #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.def" }; - enum { BaseBits = 2, BaseMask = 0x3 }; + enum { BaseBits = 2, BaseMask = 0b11 }; protected: const void *Data = nullptr; @@ -116,7 +116,7 @@ class SVal { unsigned getRawKind() const { return Kind; } BaseKind getBaseKind() const { return (BaseKind) (Kind & BaseMask); } - unsigned getSubKind() const { return (Kind & ~BaseMask) >> BaseBits; } + unsigned getSubKind() const { return Kind >> BaseBits; } // This method is required for using SVal in a FoldingSetNode. It // extracts a unique signature for this SVal object. @@ -182,12 +182,6 @@ class SVal { /// should continue to the base regions if the region is not symbolic. SymbolRef getAsSymbol(bool IncludeBaseRegions = false) const; - /// getAsSymbolicExpression - If this Sval wraps a symbolic expression then - /// return that expression. Otherwise return NULL. - const SymExpr *getAsSymbolicExpression() const; - - const SymExpr *getAsSymExpr() const; - const MemRegion *getAsRegion() const; /// printJson - Pretty-prints in JSON format. diff --git a/clang/include/clang/Tooling/Syntax/Tree.h b/clang/include/clang/Tooling/Syntax/Tree.h index bc581004c46e6..dd51e0a740708 100644 --- a/clang/include/clang/Tooling/Syntax/Tree.h +++ b/clang/include/clang/Tooling/Syntax/Tree.h @@ -39,7 +39,7 @@ namespace syntax { class Arena { public: Arena(SourceManager &SourceMgr, const LangOptions &LangOpts, - TokenBuffer Tokens); + const TokenBuffer &Tokens); const SourceManager &sourceManager() const { return SourceMgr; } const LangOptions &langOptions() const { return LangOpts; } @@ -56,7 +56,7 @@ class Arena { private: SourceManager &SourceMgr; const LangOptions &LangOpts; - TokenBuffer Tokens; + const TokenBuffer &Tokens; /// IDs and storage for additional tokenized files. llvm::DenseMap> ExtraTokens; /// Keeps all the allocated nodes and their intermediate data structures. diff --git a/clang/include/clang/Tooling/Transformer/MatchConsumer.h b/clang/include/clang/Tooling/Transformer/MatchConsumer.h index f407ffce3d252..cb0a5f684b7d6 100644 --- a/clang/include/clang/Tooling/Transformer/MatchConsumer.h +++ b/clang/include/clang/Tooling/Transformer/MatchConsumer.h @@ -99,11 +99,5 @@ llvm::Expected MatchComputation::eval( return Output; } } // namespace transformer - -namespace tooling { -// DEPRECATED: Temporary alias supporting client migration to the `transformer` -// namespace. -using transformer::ifBound; -} // namespace tooling } // namespace clang #endif // LLVM_CLANG_TOOLING_TRANSFORMER_MATCH_CONSUMER_H_ diff --git a/clang/include/clang/Tooling/Transformer/RangeSelector.h b/clang/include/clang/Tooling/Transformer/RangeSelector.h index 2807037bc208f..e070c0e7e2e6b 100644 --- a/clang/include/clang/Tooling/Transformer/RangeSelector.h +++ b/clang/include/clang/Tooling/Transformer/RangeSelector.h @@ -56,6 +56,11 @@ RangeSelector before(RangeSelector Selector); /// * the TokenRange [B,E'] where the token at E' spans the range [E',E). 
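///
/// Editorial sketch, not from the original header: `after` composes with
/// `before` and `enclose` to select the gap between two bound nodes. Assuming
/// nodes bound as "lhs" and "rhs" in the match result:
///
///   // Selects the source text strictly between the two bound nodes.
///   RangeSelector Gap = enclose(after(node("lhs")), before(node("rhs")));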
RangeSelector after(RangeSelector Selector); +/// Selects the range between `R1` and `R2`. +inline RangeSelector between(RangeSelector R1, RangeSelector R2) { + return enclose(after(std::move(R1)), before(std::move(R2))); +} + /// Selects a node, including trailing semicolon (for non-expression /// statements). \p ID is the node's binding in the match result. RangeSelector node(std::string ID); diff --git a/clang/include/clang/Tooling/Transformer/RewriteRule.h b/clang/include/clang/Tooling/Transformer/RewriteRule.h index 1be5727364608..2a26d32817dde 100644 --- a/clang/include/clang/Tooling/Transformer/RewriteRule.h +++ b/clang/include/clang/Tooling/Transformer/RewriteRule.h @@ -107,9 +107,42 @@ struct ASTEdit { /// clients. We recommend use of the \c AtomicChange or \c Replacements classes /// for assistance in detecting such conflicts. EditGenerator editList(llvm::SmallVector Edits); -// Convenience form of `editList` for a single edit. +/// Convenience form of `editList` for a single edit. EditGenerator edit(ASTEdit); +/// Convenience generator for a no-op edit generator. +inline EditGenerator noEdits() { return editList({}); } + +/// Convenience version of `ifBound` specialized to `ASTEdit`. +inline EditGenerator ifBound(std::string ID, ASTEdit TrueEdit, + ASTEdit FalseEdit) { + return ifBound(std::move(ID), edit(std::move(TrueEdit)), + edit(std::move(FalseEdit))); +} + +/// Convenience version of `ifBound` that has no "False" branch. If the node is +/// not bound, then no edits are produced. +inline EditGenerator ifBound(std::string ID, ASTEdit TrueEdit) { + return ifBound(std::move(ID), edit(std::move(TrueEdit)), noEdits()); +} + +/// Flattens a list of generators into a single generator whose elements are the +/// concatenation of the results of the argument generators. +EditGenerator flattenVector(SmallVector Generators); + +namespace detail { +/// Convenience function to construct an \c EditGenerator. Overloaded for common +/// cases so that the user doesn't need to specify which factory function to +/// use. This pattern gives benefits similar to implicit constructors, while +/// maintaining a higher degree of explicitness. +inline EditGenerator injectEdits(ASTEdit E) { return edit(std::move(E)); } +inline EditGenerator injectEdits(EditGenerator G) { return G; } +} // namespace detail + +template EditGenerator flatten(Ts &&...Edits) { + return flattenVector({detail::injectEdits(std::forward(Edits))...}); +} + /// Format of the path in an include directive -- angle brackets or quotes. enum class IncludeFormat { Quoted, @@ -291,6 +324,31 @@ inline ASTEdit withMetadata(ASTEdit Edit, Callable Metadata) { return Edit; } +/// Assuming that the inner range is enclosed by the outer range, creates +/// precision edits to remove the parts of the outer range that are not included +/// in the inner range. +inline EditGenerator shrinkTo(RangeSelector outer, RangeSelector inner) { + return editList({remove(enclose(before(outer), before(inner))), + remove(enclose(after(inner), after(outer)))}); + +} + +/// Applies `Rule` to all descendants of the node bound to `NodeId`. `Rule` can +/// refer to nodes bound by the calling rule. `Rule` is not applied to the node +/// itself.
+/// +/// For example, +/// ``` +/// auto InlineX = +/// makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); +/// makeRule(functionDecl(hasName("f"), hasBody(stmt().bind("body"))).bind("f"), +/// flatten( +/// changeTo(name("f"), cat("newName")), +/// rewriteDescendants("body", InlineX))); +/// ``` +/// Here, we find the function `f`, change its name to `newName` and change all +/// appearances of `x` in its body to `3`. +EditGenerator rewriteDescendants(std::string NodeId, RewriteRule Rule); + /// The following three functions are a low-level part of the RewriteRule /// API. We expose them for use in implementing the fixtures that interpret /// RewriteRule, like Transformer and TransformerTidy, or for more advanced diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 886fdd8c6eb6c..ac63c88977eec 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -925,14 +925,14 @@ static const LangASMap *getAddressSpaceMap(const TargetInfo &T, 2, // opencl_constant 0, // opencl_private 4, // opencl_generic - 11, // opencl_global_device - 12, // opencl_global_host - 5, // cuda_device - 6, // cuda_constant - 7, // cuda_shared - 8, // ptr32_sptr - 9, // ptr32_uptr - 10 // ptr64 + 5, // opencl_global_device + 6, // opencl_global_host + 7, // cuda_device + 8, // cuda_constant + 9, // cuda_shared + 10, // ptr32_sptr + 11, // ptr32_uptr + 12 // ptr64 }; return &FakeAddrSpaceMap; } else { @@ -2453,8 +2453,8 @@ CharUnits ASTContext::getTypeUnadjustedAlignInChars(const Type *T) const { /// getPreferredTypeAlign - Return the "preferred" alignment of the specified /// type for the current target in bits. This can be different than the ABI -/// alignment in cases where it is beneficial for performance to overalign -/// a data type. +/// alignment in cases where it is beneficial for performance or for +/// preserving backwards compatibility to overalign a data type. unsigned ASTContext::getPreferredTypeAlign(const Type *T) const { TypeInfo TI = getTypeInfo(T); unsigned ABIAlign = TI.Align; @@ -2464,18 +2464,33 @@ unsigned ASTContext::getPreferredTypeAlign(const Type *T) const { // The preferred alignment of member pointers is that of a pointer. if (T->isMemberPointerType()) return getPreferredTypeAlign(getPointerDiffType().getTypePtr()); - + if (!Target->allowsLargerPreferedTypeAlignment()) return ABIAlign; - // Double and long long should be naturally aligned if possible. + if (const auto *RT = T->getAs()) { + if (TI.AlignIsRequired) + return ABIAlign; + + unsigned PreferredAlign = static_cast( + toBits(getASTRecordLayout(RT->getDecl()).PreferredAlignment)); + assert(PreferredAlign >= ABIAlign && + "PreferredAlign should be at least as large as ABIAlign."); + return PreferredAlign; + } + + // Double (and, for targets supporting AIX `power` alignment, long double) and + // long long should be naturally aligned (despite requiring less alignment) if + // possible. if (const auto *CT = T->getAs()) T = CT->getElementType().getTypePtr(); if (const auto *ET = T->getAs()) T = ET->getDecl()->getIntegerType().getTypePtr(); if (T->isSpecificBuiltinType(BuiltinType::Double) || T->isSpecificBuiltinType(BuiltinType::LongLong) || - T->isSpecificBuiltinType(BuiltinType::ULongLong)) + T->isSpecificBuiltinType(BuiltinType::ULongLong) || + (T->isSpecificBuiltinType(BuiltinType::LongDouble) && + Target->defaultsToAIXPowerAlignment())) // Don't increase the alignment if an alignment attribute was specified on a // typedef declaration.
if (!TI.AlignIsRequired) @@ -4816,37 +4831,27 @@ ASTContext::getInjectedTemplateArgs(const TemplateParameterList *Params, } QualType ASTContext::getPackExpansionType(QualType Pattern, - Optional NumExpansions) { + Optional NumExpansions, + bool ExpectPackInType) { + assert((!ExpectPackInType || Pattern->containsUnexpandedParameterPack()) && + "Pack expansions must expand one or more parameter packs"); + llvm::FoldingSetNodeID ID; PackExpansionType::Profile(ID, Pattern, NumExpansions); - // A deduced type can deduce to a pack, eg - // auto ...x = some_pack; - // That declaration isn't (yet) valid, but is created as part of building an - // init-capture pack: - // [...x = some_pack] {} - assert((Pattern->containsUnexpandedParameterPack() || - Pattern->getContainedDeducedType()) && - "Pack expansions must expand one or more parameter packs"); void *InsertPos = nullptr; - PackExpansionType *T - = PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); + PackExpansionType *T = PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); if (T) return QualType(T, 0); QualType Canon; if (!Pattern.isCanonical()) { - Canon = getCanonicalType(Pattern); - // The canonical type might not contain an unexpanded parameter pack, if it - // contains an alias template specialization which ignores one of its - // parameters. - if (Canon->containsUnexpandedParameterPack()) { - Canon = getPackExpansionType(Canon, NumExpansions); - - // Find the insert position again, in case we inserted an element into - // PackExpansionTypes and invalidated our insert position. - PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); - } + Canon = getPackExpansionType(getCanonicalType(Pattern), NumExpansions, + /*ExpectPackInType=*/false); + + // Find the insert position again, in case we inserted an element into + // PackExpansionTypes and invalidated our insert position. + PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos); } T = new (*this, TypeAlignment) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 6623900aff28d..59fa8b0a2351c 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1504,7 +1504,8 @@ ASTNodeImporter::VisitPackExpansionType(const PackExpansionType *T) { return ToPatternOrErr.takeError(); return Importer.getToContext().getPackExpansionType(*ToPatternOrErr, - T->getNumExpansions()); + T->getNumExpansions(), + /*ExpectPackInType=*/false); } ExpectedType ASTNodeImporter::VisitDependentTemplateSpecializationType( @@ -1906,7 +1907,8 @@ Error ASTNodeImporter::ImportDefinition( else return ToCaptureOrErr.takeError(); } - cast(To)->setCaptures(ToCaptures); + cast(To)->setCaptures(Importer.getToContext(), + ToCaptures); } Error Result = ImportDeclContext(From, /*ForceImport=*/true); @@ -4764,11 +4766,10 @@ Error ASTNodeImporter::ImportDefinition( return ToImplOrErr.takeError(); } - if (shouldForceImportDeclContext(Kind)) { - // Import all of the members of this class. - if (Error Err = ImportDeclContext(From, /*ForceImport=*/true)) - return Err; - } + // Import all of the members of this class.
+ if (Error Err = ImportDeclContext(From, /*ForceImport=*/true)) + return Err; + return Error::success(); } diff --git a/clang/lib/AST/ASTImporterLookupTable.cpp b/clang/lib/AST/ASTImporterLookupTable.cpp index 4d6fff8f34191..e17d6082dcdcc 100644 --- a/clang/lib/AST/ASTImporterLookupTable.cpp +++ b/clang/lib/AST/ASTImporterLookupTable.cpp @@ -22,6 +22,20 @@ namespace { struct Builder : RecursiveASTVisitor { ASTImporterLookupTable &LT; Builder(ASTImporterLookupTable &LT) : LT(LT) {} + + bool VisitTypedefNameDecl(TypedefNameDecl *D) { + QualType Ty = D->getUnderlyingType(); + Ty = Ty.getCanonicalType(); + if (const auto *RTy = dyn_cast(Ty)) { + LT.add(RTy->getAsRecordDecl()); + // Iterate over the field decls, adding them. + for (auto *it : RTy->getAsRecordDecl()->fields()) { + LT.add(it); + } + } + return true; + } + bool VisitNamedDecl(NamedDecl *D) { LT.add(D); return true; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 6f1fd2f14edeb..59ae5cb300f72 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1383,8 +1383,8 @@ void CXXRecordDecl::finishedDefaultedOrDeletedMember(CXXMethodDecl *D) { data().DeclaredNonTrivialSpecialMembers |= SMKind; } -void CXXRecordDecl::setCaptures(ArrayRef Captures) { - ASTContext &Context = getASTContext(); +void CXXRecordDecl::setCaptures(ASTContext &Context, + ArrayRef Captures) { CXXRecordDecl::LambdaDefinitionData &Data = getLambdaData(); // Copy captures. diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 006dc043b067a..46d24934a6966 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2388,8 +2388,8 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp switch (AS) { default: llvm_unreachable("Not a language specific address space"); // ::= "CL" [ "global" | "local" | "constant" | - // "private"| "generic" | "global_device" | - // "global_host" ] + // "private"| "generic" | "device" | + // "host" ] case LangAS::opencl_global: ASString = "CLglobal"; break; diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 0f01164f2f199..7477e3fdcbb63 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1798,7 +1798,7 @@ void MicrosoftCXXNameMangler::mangleAddressSpaceType(QualType T, // where: // ::= | // ::= "CL" [ "global" | "local" | "constant" | - // "private"| "generic" ] + // "private"| "generic" | "device" | "host" ] // ::= "CU" [ "device" | "constant" | "shared" ] // Note that the above were chosen to match the Itanium mangling for this.
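// (Editorial aside, assuming the Itanium scheme referenced above: these
// strings are emitted as vendor extended qualifiers, a 'U' followed by the
// length-prefixed string, so "CLglobal" would surface in a mangled name as
// "U8CLglobal".)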
// diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 6933c57425520..e846d325560d0 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -17,6 +17,7 @@ #include "clang/AST/DeclOpenMP.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/OpenMPKinds.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -1097,6 +1098,8 @@ OMPToClause *OMPToClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; Sizes.NumVars = Vars.size(); @@ -1121,7 +1124,8 @@ OMPToClause *OMPToClause::Create( Sizes.NumUniqueDeclarations + Sizes.NumComponentLists, Sizes.NumComponents)); - auto *Clause = new (Mem) OMPToClause(UDMQualifierLoc, MapperId, Locs, Sizes); + auto *Clause = new (Mem) OMPToClause(MotionModifiers, MotionModifiersLoc, + UDMQualifierLoc, MapperId, Locs, Sizes); Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); @@ -1144,6 +1148,8 @@ OMPFromClause *OMPFromClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, MappableExprComponentListsRef ComponentLists, ArrayRef UDMapperRefs, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, NestedNameSpecifierLoc UDMQualifierLoc, DeclarationNameInfo MapperId) { OMPMappableExprListSizeTy Sizes; Sizes.NumVars = Vars.size(); @@ -1169,7 +1175,8 @@ OMPFromClause *OMPFromClause::Create( Sizes.NumComponents)); auto *Clause = - new (Mem) OMPFromClause(UDMQualifierLoc, MapperId, Locs, Sizes); + new (Mem) OMPFromClause(MotionModifiers, MotionModifiersLoc, + UDMQualifierLoc, MapperId, Locs, Sizes); Clause->setVarRefs(Vars); Clause->setUDMapperRefs(UDMapperRefs); @@ -1936,6 +1943,17 @@ void OMPClausePrinter::VisitOMPDependClause(OMPDependClause *Node) { OS << ")"; } +template +static void PrintMapper(raw_ostream &OS, T *Node, + const PrintingPolicy &Policy) { + OS << '('; + NestedNameSpecifier *MapperNNS = + Node->getMapperQualifierLoc().getNestedNameSpecifier(); + if (MapperNNS) + MapperNNS->print(OS, Policy); + OS << Node->getMapperIdInfo() << ')'; +} + void OMPClausePrinter::VisitOMPMapClause(OMPMapClause *Node) { if (!Node->varlist_empty()) { OS << "map("; @@ -1944,14 +1962,8 @@ void OMPClausePrinter::VisitOMPMapClause(OMPMapClause *Node) { if (Node->getMapTypeModifier(I) != OMPC_MAP_MODIFIER_unknown) { OS << getOpenMPSimpleClauseTypeName(OMPC_map, Node->getMapTypeModifier(I)); - if (Node->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_mapper) { - OS << '('; - NestedNameSpecifier *MapperNNS = - Node->getMapperQualifierLoc().getNestedNameSpecifier(); - if (MapperNNS) - MapperNNS->print(OS, Policy); - OS << Node->getMapperIdInfo() << ')'; - } + if (Node->getMapTypeModifier(I) == OMPC_MAP_MODIFIER_mapper) + PrintMapper(OS, Node, Policy); OS << ','; } } @@ -1963,44 +1975,41 @@ void OMPClausePrinter::VisitOMPMapClause(OMPMapClause *Node) { } } -void OMPClausePrinter::VisitOMPToClause(OMPToClause *Node) { - if (!Node->varlist_empty()) { - OS << "to"; - DeclarationNameInfo MapperId = Node->getMapperIdInfo(); - if (MapperId.getName() && !MapperId.getName().isEmpty()) { - OS << '('; - OS << "mapper("; - NestedNameSpecifier *MapperNNS = - 
Node->getMapperQualifierLoc().getNestedNameSpecifier(); - if (MapperNNS) - MapperNNS->print(OS, Policy); - OS << MapperId << "):"; - VisitOMPClauseList(Node, ' '); - } else { - VisitOMPClauseList(Node, '('); +template void OMPClausePrinter::VisitOMPMotionClause(T *Node) { + if (Node->varlist_empty()) + return; + OS << getOpenMPClauseName(Node->getClauseKind()); + unsigned ModifierCount = 0; + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + if (Node->getMotionModifier(I) != OMPC_MOTION_MODIFIER_unknown) + ++ModifierCount; + } + if (ModifierCount) { + OS << '('; + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + if (Node->getMotionModifier(I) != OMPC_MOTION_MODIFIER_unknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), + Node->getMotionModifier(I)); + if (Node->getMotionModifier(I) == OMPC_MOTION_MODIFIER_mapper) + PrintMapper(OS, Node, Policy); + if (I < ModifierCount - 1) + OS << ", "; + } } - OS << ")"; + OS << ':'; + VisitOMPClauseList(Node, ' '); + } else { + VisitOMPClauseList(Node, '('); } + OS << ")"; +} + +void OMPClausePrinter::VisitOMPToClause(OMPToClause *Node) { + VisitOMPMotionClause(Node); } void OMPClausePrinter::VisitOMPFromClause(OMPFromClause *Node) { - if (!Node->varlist_empty()) { - OS << "from"; - DeclarationNameInfo MapperId = Node->getMapperIdInfo(); - if (MapperId.getName() && !MapperId.getName().isEmpty()) { - OS << '('; - OS << "mapper("; - NestedNameSpecifier *MapperNNS = - Node->getMapperQualifierLoc().getNestedNameSpecifier(); - if (MapperNNS) - MapperNNS->print(OS, Policy); - OS << MapperId << "):"; - VisitOMPClauseList(Node, ' '); - } else { - VisitOMPClauseList(Node, '('); - } - OS << ")"; - } + VisitOMPMotionClause(Node); } void OMPClausePrinter::VisitOMPDistScheduleClause(OMPDistScheduleClause *Node) { @@ -2131,9 +2140,10 @@ void OMPTraitInfo::getAsVariantMatchInfo(ASTContext &ASTCtx, Selector.ScoreOrCondition->getIntegerConstantExpr(ASTCtx)) VMI.addTrait(CondVal->isNullValue() ? 
TraitProperty::user_condition_false - : TraitProperty::user_condition_true); + : TraitProperty::user_condition_true, + ""); else - VMI.addTrait(TraitProperty::user_condition_false); + VMI.addTrait(TraitProperty::user_condition_false, ""); continue; } @@ -2143,11 +2153,12 @@ void OMPTraitInfo::getAsVariantMatchInfo(ASTContext &ASTCtx, if ((Score = Selector.ScoreOrCondition->getIntegerConstantExpr(ASTCtx))) ScorePtr = &*Score; else - VMI.addTrait(TraitProperty::user_condition_false); + VMI.addTrait(TraitProperty::user_condition_false, + ""); } for (const OMPTraitProperty &Property : Selector.Properties) - VMI.addTrait(Set.Kind, Property.Kind, ScorePtr); + VMI.addTrait(Set.Kind, Property.Kind, Property.RawString, ScorePtr); if (Set.Kind != TraitSet::construct) continue; @@ -2204,7 +2215,8 @@ void OMPTraitInfo::print(llvm::raw_ostream &OS, if (!FirstProperty) OS << ", "; FirstProperty = false; - OS << getOpenMPContextTraitPropertyName(Property.Kind); + OS << getOpenMPContextTraitPropertyName(Property.Kind, + Property.RawString); } } OS << ")"; @@ -2231,7 +2243,9 @@ std::string OMPTraitInfo::getMangledName() const { continue; for (const OMPTraitProperty &Property : Selector.Properties) - OS << '$' << 'P' << getOpenMPContextTraitPropertyName(Property.Kind); + OS << '$' << 'P' + << getOpenMPContextTraitPropertyName(Property.Kind, + Property.RawString); } } return OS.str(); @@ -2261,8 +2275,9 @@ OMPTraitInfo::OMPTraitInfo(StringRef MangledName) { Selector.Properties.push_back(OMPTraitProperty()); OMPTraitProperty &Property = Selector.Properties.back(); std::pair PropRestPair = MangledName.split('$'); - Property.Kind = - getOpenMPContextTraitPropertyKind(Set.Kind, PropRestPair.first); + Property.RawString = PropRestPair.first; + Property.Kind = getOpenMPContextTraitPropertyKind( + Set.Kind, Selector.Kind, PropRestPair.first); MangledName = PropRestPair.second; } while (true); } while (true); @@ -2280,3 +2295,24 @@ llvm::raw_ostream &clang::operator<<(llvm::raw_ostream &OS, const OMPTraitInfo *TI) { return TI ? OS << *TI : OS; } + +TargetOMPContext::TargetOMPContext( + ASTContext &ASTCtx, std::function &&DiagUnknownTrait, + const FunctionDecl *CurrentFunctionDecl) + : OMPContext(ASTCtx.getLangOpts().OpenMPIsDevice, + ASTCtx.getTargetInfo().getTriple()), + FeatureValidityCheck([&](StringRef FeatureName) { + return ASTCtx.getTargetInfo().isValidFeatureName(FeatureName); + }), + DiagUnknownTrait(std::move(DiagUnknownTrait)) { + ASTCtx.getFunctionFeatureMap(FeatureMap, CurrentFunctionDecl); +} + +bool TargetOMPContext::matchesISATrait(StringRef RawString) const { + auto It = FeatureMap.find(RawString); + if (It != FeatureMap.end()) + return It->second; + if (!FeatureValidityCheck(RawString)) + DiagUnknownTrait(RawString); + return false; +} diff --git a/clang/lib/AST/RecordLayout.cpp b/clang/lib/AST/RecordLayout.cpp index e7b500e1902d7..8f70a20729262 100644 --- a/clang/lib/AST/RecordLayout.cpp +++ b/clang/lib/AST/RecordLayout.cpp @@ -29,45 +29,42 @@ void ASTRecordLayout::Destroy(ASTContext &Ctx) { ASTRecordLayout::ASTRecordLayout(const ASTContext &Ctx, CharUnits size, CharUnits alignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, CharUnits requiredAlignment, CharUnits datasize, ArrayRef fieldoffsets) : Size(size), DataSize(datasize), Alignment(alignment), + PreferredAlignment(preferredAlignment), UnadjustedAlignment(unadjustedAlignment), RequiredAlignment(requiredAlignment) { FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end()); } // Constructor for C++ records. 
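// (The replacement constructor below additionally threads through the
// AIX-preferred alignment of the complete object and of its non-virtual
// subobject; an editorial summary of the parameters visible in this hunk.)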
-ASTRecordLayout::ASTRecordLayout(const ASTContext &Ctx, - CharUnits size, CharUnits alignment, - CharUnits unadjustedAlignment, - CharUnits requiredAlignment, - bool hasOwnVFPtr, bool hasExtendableVFPtr, - CharUnits vbptroffset, - CharUnits datasize, - ArrayRef fieldoffsets, - CharUnits nonvirtualsize, - CharUnits nonvirtualalignment, - CharUnits SizeOfLargestEmptySubobject, - const CXXRecordDecl *PrimaryBase, - bool IsPrimaryBaseVirtual, - const CXXRecordDecl *BaseSharingVBPtr, - bool EndsWithZeroSizedObject, - bool LeadsWithZeroSizedBase, - const BaseOffsetsMapTy& BaseOffsets, - const VBaseOffsetsMapTy& VBaseOffsets) - : Size(size), DataSize(datasize), Alignment(alignment), - UnadjustedAlignment(unadjustedAlignment), - RequiredAlignment(requiredAlignment), CXXInfo(new (Ctx) CXXRecordLayoutInfo) -{ +ASTRecordLayout::ASTRecordLayout( + const ASTContext &Ctx, CharUnits size, CharUnits alignment, + CharUnits preferredAlignment, CharUnits unadjustedAlignment, + CharUnits requiredAlignment, bool hasOwnVFPtr, bool hasExtendableVFPtr, + CharUnits vbptroffset, CharUnits datasize, ArrayRef fieldoffsets, + CharUnits nonvirtualsize, CharUnits nonvirtualalignment, + CharUnits preferrednvalignment, CharUnits SizeOfLargestEmptySubobject, + const CXXRecordDecl *PrimaryBase, bool IsPrimaryBaseVirtual, + const CXXRecordDecl *BaseSharingVBPtr, bool EndsWithZeroSizedObject, + bool LeadsWithZeroSizedBase, const BaseOffsetsMapTy &BaseOffsets, + const VBaseOffsetsMapTy &VBaseOffsets) + : Size(size), DataSize(datasize), Alignment(alignment), + PreferredAlignment(preferredAlignment), + UnadjustedAlignment(unadjustedAlignment), + RequiredAlignment(requiredAlignment), + CXXInfo(new (Ctx) CXXRecordLayoutInfo) { FieldOffsets.append(Ctx, fieldoffsets.begin(), fieldoffsets.end()); CXXInfo->PrimaryBase.setPointer(PrimaryBase); CXXInfo->PrimaryBase.setInt(IsPrimaryBaseVirtual); CXXInfo->NonVirtualSize = nonvirtualsize; CXXInfo->NonVirtualAlignment = nonvirtualalignment; + CXXInfo->PreferredNVAlignment = preferrednvalignment; CXXInfo->SizeOfLargestEmptySubobject = SizeOfLargestEmptySubobject; CXXInfo->BaseOffsets = BaseOffsets; CXXInfo->VBaseOffsets = VBaseOffsets; diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 53b4c0e41106e..75bb41ca0d147 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "clang/AST/RecordLayout.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/Attr.h" @@ -16,6 +15,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/Expr.h" #include "clang/AST/VTableBuilder.h" +#include "clang/AST/RecordLayout.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Support/Format.h" @@ -589,6 +589,9 @@ class ItaniumRecordLayoutBuilder { /// Alignment - The current alignment of the record layout. CharUnits Alignment; + /// PreferredAlignment - The preferred alignment of the record layout. + CharUnits PreferredAlignment; + /// The alignment if attribute packed is not used. CharUnits UnpackedAlignment; @@ -632,6 +635,7 @@ class ItaniumRecordLayoutBuilder { CharUnits NonVirtualSize; CharUnits NonVirtualAlignment; + CharUnits PreferredNVAlignment; /// If we've laid out a field but not included its tail padding in Size yet, /// this is the size up to the end of that field. 
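The PreferredAlignment and PreferredNVAlignment fields introduced above carry the AIX `power` alignment rules applied later in this patch. A minimal sketch of the intended effect, using illustrative structs assumed here rather than taken from the patch's tests:

    // On an AIX target using power alignment: a double as the first member
    // raises the preferred alignment to 8; a double after another member
    // keeps the 4-byte value.
    struct FirstDouble { double D; char C; }; // size 16, preferredalign = 8
    struct LaterDouble { char C; double D; }; // size 12, preferredalign = 4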
@@ -652,6 +656,12 @@ class ItaniumRecordLayoutBuilder { /// the flag of field offset changing due to packed attribute. bool HasPackedField; + /// HandledFirstNonOverlappingEmptyField - An auxiliary flag used for AIX: + /// when the aggregate contains overlapping empty fields, it records whether + /// the first subsequent field that is non-empty, or empty but + /// non-overlapping, has been handled yet, if any such field exists. + bool HandledFirstNonOverlappingEmptyField; + typedef llvm::DenseMap BaseOffsetsMapTy; /// Bases - base classes and their offsets in the record. @@ -678,17 +688,19 @@ class ItaniumRecordLayoutBuilder { ItaniumRecordLayoutBuilder(const ASTContext &Context, EmptySubobjectMap *EmptySubobjects) : Context(Context), EmptySubobjects(EmptySubobjects), Size(0), - Alignment(CharUnits::One()), UnpackedAlignment(CharUnits::One()), - UnadjustedAlignment(CharUnits::One()), - UseExternalLayout(false), InferAlignment(false), Packed(false), - IsUnion(false), IsMac68kAlign(false), IsMsStruct(false), - UnfilledBitsInLastUnit(0), LastBitfieldTypeSize(0), - MaxFieldAlignment(CharUnits::Zero()), DataSize(0), - NonVirtualSize(CharUnits::Zero()), + Alignment(CharUnits::One()), PreferredAlignment(CharUnits::One()), + UnpackedAlignment(CharUnits::One()), + UnadjustedAlignment(CharUnits::One()), UseExternalLayout(false), + InferAlignment(false), Packed(false), IsUnion(false), + IsMac68kAlign(false), IsMsStruct(false), UnfilledBitsInLastUnit(0), + LastBitfieldTypeSize(0), MaxFieldAlignment(CharUnits::Zero()), + DataSize(0), NonVirtualSize(CharUnits::Zero()), NonVirtualAlignment(CharUnits::One()), + PreferredNVAlignment(CharUnits::One()), PaddedFieldSize(CharUnits::Zero()), PrimaryBase(nullptr), - PrimaryBaseIsVirtual(false), HasOwnVFPtr(false), - HasPackedField(false), FirstNearlyEmptyVBase(nullptr) {} + PrimaryBaseIsVirtual(false), HasOwnVFPtr(false), HasPackedField(false), + HandledFirstNonOverlappingEmptyField(false), + FirstNearlyEmptyVBase(nullptr) {} void Layout(const RecordDecl *D); void Layout(const CXXRecordDecl *D); @@ -763,9 +775,13 @@ class ItaniumRecordLayoutBuilder { /// alignment. void FinishLayout(const NamedDecl *D); - void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment); + void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment, + CharUnits PreferredAlignment); + void UpdateAlignment(CharUnits NewAlignment, CharUnits UnpackedNewAlignment) { + UpdateAlignment(NewAlignment, UnpackedNewAlignment, NewAlignment); + } void UpdateAlignment(CharUnits NewAlignment) { - UpdateAlignment(NewAlignment, NewAlignment); + UpdateAlignment(NewAlignment, NewAlignment, NewAlignment); } /// Retrieve the externally-supplied field offset for the given @@ -998,7 +1014,7 @@ void ItaniumRecordLayoutBuilder::EnsureVTablePointerAlignment( setSize(getSize().alignTo(BaseAlign)); // Update the alignment.
- UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedBaseAlign, BaseAlign); } void ItaniumRecordLayoutBuilder::LayoutNonVirtualBases( @@ -1044,6 +1060,10 @@ void ItaniumRecordLayoutBuilder::LayoutNonVirtualBases( Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerAlign(0)); EnsureVTablePointerAlignment(PtrAlign); HasOwnVFPtr = true; + + assert(!IsUnion && "Unions cannot be dynamic classes."); + HandledFirstNonOverlappingEmptyField = true; + setSize(getSize() + PtrWidth); setDataSize(getSize()); } @@ -1179,9 +1199,9 @@ void ItaniumRecordLayoutBuilder::LayoutVirtualBase( CharUnits ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { - const ASTRecordLayout &Layout = Context.getASTRecordLayout(Base->Class); - + assert(!IsUnion && "Unions cannot have base classes."); + const ASTRecordLayout &Layout = Context.getASTRecordLayout(Base->Class); CharUnits Offset; // Query the external layout to see if it provides an offset. @@ -1193,45 +1213,77 @@ ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { HasExternalLayout = External.getExternalNVBaseOffset(Base->Class, Offset); } - // Clang <= 6 incorrectly applied the 'packed' attribute to base classes. - // Per GCC's documentation, it only applies to non-static data members. + auto getBaseOrPreferredBaseAlignFromUnpacked = [&](CharUnits UnpackedAlign) { + // Clang <= 6 incorrectly applied the 'packed' attribute to base classes. + // Per GCC's documentation, it only applies to non-static data members. + return (Packed && ((Context.getLangOpts().getClangABICompat() <= + LangOptions::ClangABI::Ver6) || + Context.getTargetInfo().getTriple().isPS4() || + Context.getTargetInfo().getTriple().isOSAIX())) + ? CharUnits::One() + : UnpackedAlign; + }; + CharUnits UnpackedBaseAlign = Layout.getNonVirtualAlignment(); + CharUnits UnpackedPreferredBaseAlign = Layout.getPreferredNVAlignment(); CharUnits BaseAlign = - (Packed && ((Context.getLangOpts().getClangABICompat() <= - LangOptions::ClangABI::Ver6) || - Context.getTargetInfo().getTriple().isPS4())) - ? CharUnits::One() - : UnpackedBaseAlign; + getBaseOrPreferredBaseAlignFromUnpacked(UnpackedBaseAlign); + CharUnits PreferredBaseAlign = + getBaseOrPreferredBaseAlignFromUnpacked(UnpackedPreferredBaseAlign); + + const bool DefaultsToAIXPowerAlignment = + Context.getTargetInfo().defaultsToAIXPowerAlignment(); + if (DefaultsToAIXPowerAlignment) { + // AIX `power` alignment does not apply the preferred alignment for + // non-union classes if the source of the alignment (the current base in + // this context) follows introduction of the first subobject with + // exclusively allocated space or zero-extent array. + if (!Base->Class->isEmpty() && !HandledFirstNonOverlappingEmptyField) { + // By handling a base class that is not empty, we're handling the + // "first (inherited) member". + HandledFirstNonOverlappingEmptyField = true; + } else { + UnpackedPreferredBaseAlign = UnpackedBaseAlign; + PreferredBaseAlign = BaseAlign; + } + } + CharUnits UnpackedAlignTo = !DefaultsToAIXPowerAlignment + ? UnpackedBaseAlign + : UnpackedPreferredBaseAlign; // If we have an empty base class, try to place it at offset 0. 
if (Base->Class->isEmpty() && (!HasExternalLayout || Offset == CharUnits::Zero()) && EmptySubobjects->CanPlaceBaseAtOffset(Base, CharUnits::Zero())) { setSize(std::max(getSize(), Layout.getSize())); - UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedAlignTo, PreferredBaseAlign); return CharUnits::Zero(); } - // The maximum field alignment overrides base align. + // The maximum field alignment overrides the base align/(AIX-only) preferred + // base align. if (!MaxFieldAlignment.isZero()) { BaseAlign = std::min(BaseAlign, MaxFieldAlignment); - UnpackedBaseAlign = std::min(UnpackedBaseAlign, MaxFieldAlignment); + PreferredBaseAlign = std::min(PreferredBaseAlign, MaxFieldAlignment); + UnpackedAlignTo = std::min(UnpackedAlignTo, MaxFieldAlignment); } + CharUnits AlignTo = + !DefaultsToAIXPowerAlignment ? BaseAlign : PreferredBaseAlign; if (!HasExternalLayout) { // Round up the current record size to the base's alignment boundary. - Offset = getDataSize().alignTo(BaseAlign); + Offset = getDataSize().alignTo(AlignTo); // Try to place the base. while (!EmptySubobjects->CanPlaceBaseAtOffset(Base, Offset)) - Offset += BaseAlign; + Offset += AlignTo; } else { bool Allowed = EmptySubobjects->CanPlaceBaseAtOffset(Base, Offset); (void)Allowed; assert(Allowed && "Base subobject externally placed at overlapping offset"); - if (InferAlignment && Offset < getDataSize().alignTo(BaseAlign)) { + if (InferAlignment && Offset < getDataSize().alignTo(AlignTo)) { // The externally-supplied base offset is before the base offset we // computed. Assume that the structure is packed. Alignment = CharUnits::One(); @@ -1248,7 +1300,7 @@ ItaniumRecordLayoutBuilder::LayoutBase(const BaseSubobjectInfo *Base) { setSize(std::max(getSize(), Offset + Layout.getSize())); // Remember max struct/class alignment. - UpdateAlignment(BaseAlign, UnpackedBaseAlign); + UpdateAlignment(BaseAlign, UnpackedAlignTo, PreferredBaseAlign); return Offset; } @@ -1260,6 +1312,8 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { } Packed = D->hasAttr(); + HandledFirstNonOverlappingEmptyField = + !Context.getTargetInfo().defaultsToAIXPowerAlignment(); // Honor the default struct packing maximum alignment flag. if (unsigned DefaultMaxFieldAlignment = Context.getLangOpts().PackStruct) { @@ -1274,6 +1328,7 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { IsMac68kAlign = true; MaxFieldAlignment = CharUnits::fromQuantity(2); Alignment = CharUnits::fromQuantity(2); + PreferredAlignment = CharUnits::fromQuantity(2); } else { if (const MaxFieldAlignmentAttr *MFAA = D->getAttr()) MaxFieldAlignment = Context.toCharUnitsFromBits(MFAA->getAlignment()); @@ -1293,6 +1348,7 @@ void ItaniumRecordLayoutBuilder::InitializeLayout(const Decl *D) { if (UseExternalLayout) { if (External.Align > 0) { Alignment = Context.toCharUnitsFromBits(External.Align); + PreferredAlignment = Context.toCharUnitsFromBits(External.Align); } else { // The external source didn't have alignment information; infer it. InferAlignment = true; @@ -1321,6 +1377,7 @@ void ItaniumRecordLayoutBuilder::Layout(const CXXRecordDecl *RD) { NonVirtualSize = Context.toCharUnitsFromBits( llvm::alignTo(getSizeInBits(), Context.getTargetInfo().getCharAlign())); NonVirtualAlignment = Alignment; + PreferredNVAlignment = PreferredAlignment; // Lay out the virtual bases and add the primary virtual base offsets. 
LayoutVirtualBases(RD, RD); @@ -1733,25 +1790,46 @@ void ItaniumRecordLayoutBuilder::LayoutBitField(const FieldDecl *D) { void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, bool InsertExtraPadding) { + auto *FieldClass = D->getType()->getAsCXXRecordDecl(); + bool PotentiallyOverlapping = D->hasAttr() && FieldClass; + bool IsOverlappingEmptyField = + PotentiallyOverlapping && FieldClass->isEmpty(); + + CharUnits FieldOffset = + (IsUnion || IsOverlappingEmptyField) ? CharUnits::Zero() : getDataSize(); + + const bool DefaultsToAIXPowerAlignment = + Context.getTargetInfo().defaultsToAIXPowerAlignment(); + bool FoundFirstNonOverlappingEmptyFieldForAIX = false; + if (DefaultsToAIXPowerAlignment && !HandledFirstNonOverlappingEmptyField) { + assert(FieldOffset == CharUnits::Zero() && + "The first non-overlapping empty field should have been handled."); + + if (!IsOverlappingEmptyField) { + FoundFirstNonOverlappingEmptyFieldForAIX = true; + + // We're going to handle the "first member" based on + // `FoundFirstNonOverlappingEmptyFieldForAIX` during the current + // invocation of this function; record it as handled for future + // invocations (except for unions, because the current field does not + // represent all "firsts"). + HandledFirstNonOverlappingEmptyField = !IsUnion; + } + } + if (D->isBitField()) { LayoutBitField(D); return; } uint64_t UnpaddedFieldOffset = getDataSizeInBits() - UnfilledBitsInLastUnit; - // Reset the unfilled bits. UnfilledBitsInLastUnit = 0; LastBitfieldTypeSize = 0; - auto *FieldClass = D->getType()->getAsCXXRecordDecl(); - bool PotentiallyOverlapping = D->hasAttr() && FieldClass; - bool IsOverlappingEmptyField = PotentiallyOverlapping && FieldClass->isEmpty(); bool FieldPacked = Packed || D->hasAttr(); - CharUnits FieldOffset = (IsUnion || IsOverlappingEmptyField) - ? CharUnits::Zero() - : getDataSize(); + bool AlignIsRequired = false; CharUnits FieldSize; CharUnits FieldAlign; // The amount of this class's dsize occupied by the field. @@ -1759,25 +1837,27 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, // into the field's tail padding. CharUnits EffectiveFieldSize; + auto setDeclInfo = [&](bool IsIncompleteArrayType) { + TypeInfo TI = Context.getTypeInfo(D->getType()); + FieldAlign = Context.toCharUnitsFromBits(TI.Align); + // Flexible array members don't have any size, but they have to be + // aligned appropriately for their element type. + EffectiveFieldSize = FieldSize = + IsIncompleteArrayType ? CharUnits::Zero() + : Context.toCharUnitsFromBits(TI.Width); + AlignIsRequired = TI.AlignIsRequired; + }; + if (D->getType()->isIncompleteArrayType()) { - // This is a flexible array member; we can't directly - // query getTypeInfo about these, so we figure it out here. - // Flexible array members don't have any size, but they - // have to be aligned appropriately for their element type. 
- EffectiveFieldSize = FieldSize = CharUnits::Zero(); - const ArrayType* ATy = Context.getAsArrayType(D->getType()); - FieldAlign = Context.getTypeAlignInChars(ATy->getElementType()); + setDeclInfo(true /* IsIncompleteArrayType */); } else if (const ReferenceType *RT = D->getType()->getAs()) { unsigned AS = Context.getTargetAddressSpace(RT->getPointeeType()); - EffectiveFieldSize = FieldSize = - Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerWidth(AS)); - FieldAlign = - Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerAlign(AS)); + EffectiveFieldSize = FieldSize = Context.toCharUnitsFromBits( + Context.getTargetInfo().getPointerWidth(AS)); + FieldAlign = Context.toCharUnitsFromBits( + Context.getTargetInfo().getPointerAlign(AS)); } else { - std::pair FieldInfo = - Context.getTypeInfoInChars(D->getType()); - EffectiveFieldSize = FieldSize = FieldInfo.first; - FieldAlign = FieldInfo.second; + setDeclInfo(false /* IsIncompleteArrayType */); // A potentially-overlapping field occupies its dsize or nvsize, whichever // is larger. @@ -1829,31 +1909,72 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, } } + // The AIX `power` alignment rules apply the natural alignment of the + // "first member" if it is of a floating-point data type (or is an aggregate + // whose recursively "first" member or element is such a type). The alignment + // associated with these types for subsequent members use an alignment value + // where the floating-point data type is considered to have 4-byte alignment. + // + // For the purposes of the foregoing: vtable pointers, non-empty base classes, + // and zero-width bit-fields count as prior members; members of empty class + // types marked `no_unique_address` are not considered to be prior members. + CharUnits PreferredAlign = FieldAlign; + if (DefaultsToAIXPowerAlignment && !AlignIsRequired && + FoundFirstNonOverlappingEmptyFieldForAIX) { + auto performBuiltinTypeAlignmentUpgrade = [&](const BuiltinType *BTy) { + if (BTy->getKind() == BuiltinType::Double || + BTy->getKind() == BuiltinType::LongDouble) { + assert(PreferredAlign == CharUnits::fromQuantity(4) && + "No need to upgrade the alignment value."); + PreferredAlign = CharUnits::fromQuantity(8); + } + }; + + const Type *Ty = D->getType()->getBaseElementTypeUnsafe(); + if (const ComplexType *CTy = Ty->getAs()) { + performBuiltinTypeAlignmentUpgrade(CTy->getElementType()->castAs()); + } else if (const BuiltinType *BTy = Ty->getAs()) { + performBuiltinTypeAlignmentUpgrade(BTy); + } else if (const RecordType *RT = Ty->getAs()) { + const RecordDecl *RD = RT->getDecl(); + assert(RD && "Expected non-null RecordDecl."); + const ASTRecordLayout &FieldRecord = Context.getASTRecordLayout(RD); + PreferredAlign = FieldRecord.getPreferredAlignment(); + } + } + // The align if the field is not packed. This is to check if the attribute // was unnecessary (-Wpacked). - CharUnits UnpackedFieldAlign = FieldAlign; + CharUnits UnpackedFieldAlign = + !DefaultsToAIXPowerAlignment ? 
FieldAlign : PreferredAlign; CharUnits UnpackedFieldOffset = FieldOffset; - if (FieldPacked) + if (FieldPacked) { FieldAlign = CharUnits::One(); + PreferredAlign = CharUnits::One(); + } CharUnits MaxAlignmentInChars = - Context.toCharUnitsFromBits(D->getMaxAlignment()); + Context.toCharUnitsFromBits(D->getMaxAlignment()); FieldAlign = std::max(FieldAlign, MaxAlignmentInChars); + PreferredAlign = std::max(PreferredAlign, MaxAlignmentInChars); UnpackedFieldAlign = std::max(UnpackedFieldAlign, MaxAlignmentInChars); // The maximum field alignment overrides the aligned attribute. if (!MaxFieldAlignment.isZero()) { FieldAlign = std::min(FieldAlign, MaxFieldAlignment); + PreferredAlign = std::min(PreferredAlign, MaxFieldAlignment); UnpackedFieldAlign = std::min(UnpackedFieldAlign, MaxFieldAlignment); } + CharUnits AlignTo = + !DefaultsToAIXPowerAlignment ? FieldAlign : PreferredAlign; // Round up the current record size to the field's alignment boundary. - FieldOffset = FieldOffset.alignTo(FieldAlign); + FieldOffset = FieldOffset.alignTo(AlignTo); UnpackedFieldOffset = UnpackedFieldOffset.alignTo(UnpackedFieldAlign); if (UseExternalLayout) { FieldOffset = Context.toCharUnitsFromBits( - updateExternalFieldOffset(D, Context.toBits(FieldOffset))); + updateExternalFieldOffset(D, Context.toBits(FieldOffset))); if (!IsUnion && EmptySubobjects) { // Record the fact that we're placing a field at this offset. @@ -1869,9 +1990,9 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, // We try offset 0 (for an empty field) and then dsize(C) onwards. if (FieldOffset == CharUnits::Zero() && getDataSize() != CharUnits::Zero()) - FieldOffset = getDataSize().alignTo(FieldAlign); + FieldOffset = getDataSize().alignTo(AlignTo); else - FieldOffset += FieldAlign; + FieldOffset += AlignTo; } } } @@ -1908,9 +2029,9 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, (uint64_t)Context.toBits(FieldOffset + FieldSize))); } - // Remember max struct/class alignment. + // Remember max struct/class ABI-specified alignment. UnadjustedAlignment = std::max(UnadjustedAlignment, FieldAlign); - UpdateAlignment(FieldAlign, UnpackedFieldAlign); + UpdateAlignment(FieldAlign, UnpackedFieldAlign, PreferredAlign); } void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { @@ -1936,8 +2057,12 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { uint64_t UnpaddedSize = getSizeInBits() - UnfilledBitsInLastUnit; uint64_t UnpackedSizeInBits = llvm::alignTo(getSizeInBits(), Context.toBits(UnpackedAlignment)); - uint64_t RoundedSize = - llvm::alignTo(getSizeInBits(), Context.toBits(Alignment)); + + uint64_t RoundedSize = llvm::alignTo( + getSizeInBits(), + Context.toBits(!Context.getTargetInfo().defaultsToAIXPowerAlignment() + ? Alignment + : PreferredAlignment)); if (UseExternalLayout) { // If we're inferring alignment, and the external size is smaller than @@ -1945,6 +2070,7 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { // alignment to 1. 
if (InferAlignment && External.Size < RoundedSize) { Alignment = CharUnits::One(); + PreferredAlignment = CharUnits::One(); InferAlignment = false; } setSize(External.Size); @@ -1981,7 +2107,8 @@ void ItaniumRecordLayoutBuilder::FinishLayout(const NamedDecl *D) { } void ItaniumRecordLayoutBuilder::UpdateAlignment( - CharUnits NewAlignment, CharUnits UnpackedNewAlignment) { + CharUnits NewAlignment, CharUnits UnpackedNewAlignment, + CharUnits PreferredNewAlignment) { // The alignment is not modified when using 'mac68k' alignment or when // we have an externally-supplied layout that also provides overall alignment. if (IsMac68kAlign || (UseExternalLayout && !InferAlignment)) @@ -1998,6 +2125,12 @@ void ItaniumRecordLayoutBuilder::UpdateAlignment( "Alignment not a power of 2"); UnpackedAlignment = UnpackedNewAlignment; } + + if (PreferredNewAlignment > PreferredAlignment) { + assert(llvm::isPowerOf2_64(PreferredNewAlignment.getQuantity()) && + "Alignment not a power of 2"); + PreferredAlignment = PreferredNewAlignment; + } } uint64_t @@ -2009,6 +2142,7 @@ ItaniumRecordLayoutBuilder::updateExternalFieldOffset(const FieldDecl *Field, // The externally-supplied field offset is before the field offset we // computed. Assume that the structure is packed. Alignment = CharUnits::One(); + PreferredAlignment = CharUnits::One(); InferAlignment = false; } @@ -3067,10 +3201,10 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.cxxLayout(RD); NewEntry = new (*this) ASTRecordLayout( *this, Builder.Size, Builder.Alignment, Builder.Alignment, - Builder.RequiredAlignment, - Builder.HasOwnVFPtr, Builder.HasOwnVFPtr || Builder.PrimaryBase, - Builder.VBPtrOffset, Builder.DataSize, Builder.FieldOffsets, - Builder.NonVirtualSize, Builder.Alignment, CharUnits::Zero(), + Builder.Alignment, Builder.RequiredAlignment, Builder.HasOwnVFPtr, + Builder.HasOwnVFPtr || Builder.PrimaryBase, Builder.VBPtrOffset, + Builder.DataSize, Builder.FieldOffsets, Builder.NonVirtualSize, + Builder.Alignment, Builder.Alignment, CharUnits::Zero(), Builder.PrimaryBase, false, Builder.SharedVBPtrBase, Builder.EndsWithZeroSizedObject, Builder.LeadsWithZeroSizedBase, Builder.Bases, Builder.VBases); @@ -3078,8 +3212,8 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.layout(D); NewEntry = new (*this) ASTRecordLayout( *this, Builder.Size, Builder.Alignment, Builder.Alignment, - Builder.RequiredAlignment, - Builder.Size, Builder.FieldOffsets); + Builder.Alignment, Builder.RequiredAlignment, Builder.Size, + Builder.FieldOffsets); } } else { if (const auto *RD = dyn_cast(D)) { @@ -3099,11 +3233,13 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { CharUnits NonVirtualSize = skipTailPadding ? 
DataSize : Builder.NonVirtualSize; NewEntry = new (*this) ASTRecordLayout( - *this, Builder.getSize(), Builder.Alignment, Builder.UnadjustedAlignment, + *this, Builder.getSize(), Builder.Alignment, + Builder.PreferredAlignment, Builder.UnadjustedAlignment, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.HasOwnVFPtr, RD->isDynamicClass(), CharUnits::fromQuantity(-1), DataSize, Builder.FieldOffsets, NonVirtualSize, Builder.NonVirtualAlignment, + Builder.PreferredNVAlignment, EmptySubobjects.SizeOfLargestEmptySubobject, Builder.PrimaryBase, Builder.PrimaryBaseIsVirtual, nullptr, false, false, Builder.Bases, Builder.VBases); @@ -3112,7 +3248,8 @@ ASTContext::getASTRecordLayout(const RecordDecl *D) const { Builder.Layout(D); NewEntry = new (*this) ASTRecordLayout( - *this, Builder.getSize(), Builder.Alignment, Builder.UnadjustedAlignment, + *this, Builder.getSize(), Builder.Alignment, + Builder.PreferredAlignment, Builder.UnadjustedAlignment, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.getSize(), Builder.FieldOffsets); } @@ -3264,14 +3401,11 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, ItaniumRecordLayoutBuilder Builder(*this, /*EmptySubobjects=*/nullptr); Builder.Layout(D); - const ASTRecordLayout *NewEntry = - new (*this) ASTRecordLayout(*this, Builder.getSize(), - Builder.Alignment, - Builder.UnadjustedAlignment, - /*RequiredAlignment : used by MS-ABI)*/ - Builder.Alignment, - Builder.getDataSize(), - Builder.FieldOffsets); + const ASTRecordLayout *NewEntry = new (*this) ASTRecordLayout( + *this, Builder.getSize(), Builder.Alignment, Builder.PreferredAlignment, + Builder.UnadjustedAlignment, + /*RequiredAlignment : used by MS-ABI)*/ + Builder.Alignment, Builder.getDataSize(), Builder.FieldOffsets); ObjCLayouts[Key] = NewEntry; @@ -3434,22 +3568,26 @@ static void DumpRecordLayout(raw_ostream &OS, const RecordDecl *RD, if (CXXRD && !isMsLayout(C)) OS << ", dsize=" << Layout.getDataSize().getQuantity(); OS << ", align=" << Layout.getAlignment().getQuantity(); + if (C.getTargetInfo().defaultsToAIXPowerAlignment()) + OS << ", preferredalign=" << Layout.getPreferredAlignment().getQuantity(); if (CXXRD) { OS << ",\n"; PrintIndentNoOffset(OS, IndentLevel - 1); OS << " nvsize=" << Layout.getNonVirtualSize().getQuantity(); OS << ", nvalign=" << Layout.getNonVirtualAlignment().getQuantity(); + if (C.getTargetInfo().defaultsToAIXPowerAlignment()) + OS << ", preferrednvalign=" + << Layout.getPreferredNVAlignment().getQuantity(); } OS << "]\n"; } -void ASTContext::DumpRecordLayout(const RecordDecl *RD, - raw_ostream &OS, +void ASTContext::DumpRecordLayout(const RecordDecl *RD, raw_ostream &OS, bool Simple) const { if (!Simple) { ::DumpRecordLayout(OS, RD, *this, CharUnits(), 0, nullptr, - /*PrintSizeInfo*/true, + /*PrintSizeInfo*/ true, /*IncludeVirtualBases=*/true); return; } @@ -3469,9 +3607,13 @@ void ASTContext::DumpRecordLayout(const RecordDecl *RD, if (!isMsLayout(*this)) OS << " DataSize:" << toBits(Info.getDataSize()) << "\n"; OS << " Alignment:" << toBits(Info.getAlignment()) << "\n"; + if (Target->defaultsToAIXPowerAlignment()) + OS << " PreferredAlignment:" << toBits(Info.getPreferredAlignment()) + << "\n"; OS << " FieldOffsets: ["; for (unsigned i = 0, e = Info.getFieldCount(); i != e; ++i) { - if (i) OS << ", "; + if (i) + OS << ", "; OS << Info.getFieldOffset(i); } OS << "]>\n"; diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 91b984820cd22..5b6c6085e02cf 100644 --- a/clang/lib/AST/TextNodeDumper.cpp 
+++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1994,7 +1994,7 @@ void TextNodeDumper::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) { dumpBareDeclRef(TC->getFoundDecl()); OS << ")"; } - Visit(TC->getImmediatelyDeclaredConstraint()); + AddChild([=] { Visit(TC->getImmediatelyDeclaredConstraint()); }); } else if (D->wasDeclaredWithTypename()) OS << " typename"; else diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 6edf367a4fa4d..0dfe9c29c600e 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1187,9 +1187,6 @@ struct SimpleTransformVisitor : public TypeVisitor { T->getTypeConstraintArguments()); } - // FIXME: Non-trivial to implement, but important for C++ - SUGARED_TYPE_CLASS(PackExpansion) - QualType VisitObjCObjectType(const ObjCObjectType *T) { QualType baseType = recurse(T->getBaseType()); if (baseType.isNull()) @@ -3354,6 +3351,12 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, getExtProtoInfo(), Ctx, isCanonicalUnqualified()); } +TypedefType::TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can) + : Type(tc, can, D->getUnderlyingType()->getDependence()), + Decl(const_cast(D)) { + assert(!isa(can) && "Invalid canonical type"); +} + QualType TypedefType::desugar() const { return getDecl()->getUnderlyingType(); } diff --git a/clang/lib/ASTMatchers/Dynamic/CMakeLists.txt b/clang/lib/ASTMatchers/Dynamic/CMakeLists.txt index adb891575b308..7110a503869e6 100644 --- a/clang/lib/ASTMatchers/Dynamic/CMakeLists.txt +++ b/clang/lib/ASTMatchers/Dynamic/CMakeLists.txt @@ -3,15 +3,6 @@ set(LLVM_LINK_COMPONENTS Support ) -# The registry source file ends up generating a lot of sections for each -# matcher. Each matcher appears to get a vtable and several methods. Each -# method needs .text, .pdata, .xdata, and .debug sections, adding to the -# section multiplier. By default MSVC has a 2^16 limit on the number of -# sections in an object file, and this needs more than that. -if (MSVC) - set_source_files_properties(Registry.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() - add_clang_library(clangDynamicASTMatchers Diagnostics.cpp Marshallers.cpp diff --git a/clang/lib/Analysis/PathDiagnostic.cpp b/clang/lib/Analysis/PathDiagnostic.cpp index 9aa3386129d7f..f80b99b998063 100644 --- a/clang/lib/Analysis/PathDiagnostic.cpp +++ b/clang/lib/Analysis/PathDiagnostic.cpp @@ -1134,7 +1134,6 @@ void PathDiagnosticPopUpPiece::Profile(llvm::FoldingSetNodeID &ID) const { void PathDiagnostic::Profile(llvm::FoldingSetNodeID &ID) const { ID.Add(getLocation()); ID.Add(getUniqueingLoc()); - ID.AddPointer(getUniqueingLoc().isValid() ? 
getUniqueingDecl() : nullptr); ID.AddString(BugType); ID.AddString(VerboseDesc); ID.AddString(Category); diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 4807702e896e8..5c19d60cbd6e3 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -64,17 +64,16 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, return Type; } case OMPC_to: - return llvm::StringSwitch(Str) -#define OPENMP_TO_MODIFIER_KIND(Name) \ - .Case(#Name, static_cast(OMPC_TO_MODIFIER_##Name)) -#include "clang/Basic/OpenMPKinds.def" - .Default(OMPC_TO_MODIFIER_unknown); - case OMPC_from: - return llvm::StringSwitch(Str) -#define OPENMP_FROM_MODIFIER_KIND(Name) \ - .Case(#Name, static_cast(OMPC_FROM_MODIFIER_##Name)) + case OMPC_from: { + unsigned Type = llvm::StringSwitch(Str) +#define OPENMP_MOTION_MODIFIER_KIND(Name) \ + .Case(#Name, static_cast(OMPC_MOTION_MODIFIER_##Name)) #include "clang/Basic/OpenMPKinds.def" - .Default(OMPC_FROM_MODIFIER_unknown); + .Default(OMPC_MOTION_MODIFIER_unknown); + if (OpenMPVersion < 51 && Type == OMPC_MOTION_MODIFIER_present) + return OMPC_MOTION_MODIFIER_unknown; + return Type; + } case OMPC_dist_schedule: return llvm::StringSwitch(Str) #define OPENMP_DIST_SCHEDULE_KIND(Name) .Case(#Name, OMPC_DIST_SCHEDULE_##Name) @@ -258,29 +257,18 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, } llvm_unreachable("Invalid OpenMP 'map' clause type"); case OMPC_to: - switch (Type) { - case OMPC_TO_MODIFIER_unknown: - return "unknown"; -#define OPENMP_TO_MODIFIER_KIND(Name) \ - case OMPC_TO_MODIFIER_##Name: \ - return #Name; -#include "clang/Basic/OpenMPKinds.def" - default: - break; - } - llvm_unreachable("Invalid OpenMP 'to' clause type"); case OMPC_from: switch (Type) { - case OMPC_FROM_MODIFIER_unknown: + case OMPC_MOTION_MODIFIER_unknown: return "unknown"; -#define OPENMP_FROM_MODIFIER_KIND(Name) \ - case OMPC_FROM_MODIFIER_##Name: \ +#define OPENMP_MOTION_MODIFIER_KIND(Name) \ + case OMPC_MOTION_MODIFIER_##Name: \ return #Name; #include "clang/Basic/OpenMPKinds.def" default: break; } - llvm_unreachable("Invalid OpenMP 'from' clause type"); + llvm_unreachable("Invalid OpenMP 'to' or 'from' clause type"); case OMPC_dist_schedule: switch (Type) { case OMPC_DIST_SCHEDULE_unknown: diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index cfa362bef1b1c..7b3acc335a352 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -719,6 +719,8 @@ class AIXTargetInfo : public OSTargetInfo { // AIX sets FLT_EVAL_METHOD to be 1. 
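// (FLT_EVAL_METHOD == 1 means operations and constants of type float and
// double are evaluated to the range and precision of double; C11 5.2.4.2.2.)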
unsigned getFloatEvalMethod() const override { return 1; } bool hasInt128Type() const override { return false; } + + bool defaultsToAIXPowerAlignment() const override { return true; } }; void addWindowsDefines(const llvm::Triple &Triple, const LangOptions &Opts, diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index f0de2bf070ea4..c5ad4a5a2d270 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -46,8 +46,6 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasP8Crypto = true; } else if (Feature == "+direct-move") { HasDirectMove = true; - } else if (Feature == "+qpx") { - HasQPX = true; } else if (Feature == "+htm") { HasHTM = true; } else if (Feature == "+float128") { @@ -64,6 +62,8 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features, LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } else if (Feature == "-hard-float") { FloatABI = SoftFloat; + } else if (Feature == "+paired-vector-memops") { + PairedVectorMemops = true; } // TODO: Finish this list and add an assert that we've handled them // all. @@ -99,7 +99,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, } // ABI options. - if (ABI == "elfv1" || ABI == "elfv1-qpx") + if (ABI == "elfv1") Builder.defineMacro("_CALL_ELF", "1"); if (ABI == "elfv2") Builder.defineMacro("_CALL_ELF", "2"); @@ -159,22 +159,11 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("_ARCH_PWR10"); if (ArchDefs & ArchDefineA2) Builder.defineMacro("_ARCH_A2"); - if (ArchDefs & ArchDefineA2q) { - Builder.defineMacro("_ARCH_A2Q"); - Builder.defineMacro("_ARCH_QP"); - } if (ArchDefs & ArchDefineE500) Builder.defineMacro("__NO_LWSYNC__"); if (ArchDefs & ArchDefineFuture) Builder.defineMacro("_ARCH_PWR_FUTURE"); - if (getTriple().getVendor() == llvm::Triple::BGQ) { - Builder.defineMacro("__bg__"); - Builder.defineMacro("__THW_BLUEGENE__"); - Builder.defineMacro("__bgq__"); - Builder.defineMacro("__TOS_BGQ__"); - } - if (HasAltivec) { Builder.defineMacro("__VEC__", "10206"); Builder.defineMacro("__ALTIVEC__"); @@ -231,6 +220,7 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // - direct-move // - float128 // - power9-vector +// - paired-vector-memops // - power10-vector // then go ahead and error since the customer has expressed an incompatible // set of options. @@ -253,6 +243,7 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, Found |= FindVSXSubfeature("+direct-move", "-mdirect-move"); Found |= FindVSXSubfeature("+float128", "-mfloat128"); Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector"); + Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops"); Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector"); // Return false if any vsx subfeatures was found. @@ -277,7 +268,6 @@ bool PPCTargetInfo::initFeatureMap( .Case("ppc64le", true) .Default(false); - Features["qpx"] = (CPU == "a2q"); Features["power9-vector"] = (CPU == "pwr9"); Features["crypto"] = llvm::StringSwitch<bool>(CPU) .Case("ppc64le", true) .Default(false); @@ -354,6 +344,7 @@ bool PPCTargetInfo::initFeatureMap( void PPCTargetInfo::addP10SpecificFeatures( llvm::StringMap<bool> &Features) const { Features["htm"] = false; // HTM was removed for P10. 
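// [Editor's sketch, not part of the patch] The hunks around here keep the
// VSX-derived features consistent: "paired-vector-memops" now participates in
// the same implication chain as the other vsx subfeatures, so disabling a
// base feature must also clear everything that implies it. A standalone
// restatement of that rule (plain std::map instead of llvm::StringMap,
// hypothetical helper name):
#include <map>
#include <string>
void disableVSXAndDependents(std::map<std::string, bool> &Features) {
  Features["vsx"] = Features["direct-move"] = Features["power8-vector"] =
      Features["float128"] = Features["power9-vector"] =
          Features["paired-vector-memops"] = Features["power10-vector"] =
              false;
}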
+ Features["paired-vector-memops"] = true; Features["power10-vector"] = true; Features["pcrelative-memops"] = true; return; @@ -373,12 +364,12 @@ bool PPCTargetInfo::hasFeature(StringRef Feature) const { .Case("power8-vector", HasP8Vector) .Case("crypto", HasP8Crypto) .Case("direct-move", HasDirectMove) - .Case("qpx", HasQPX) .Case("htm", HasHTM) .Case("bpermd", HasBPERMD) .Case("extdiv", HasExtDiv) .Case("float128", HasFloat128) .Case("power9-vector", HasP9Vector) + .Case("paired-vector-memops", PairedVectorMemops) .Case("power10-vector", HasP10Vector) .Case("pcrelative-memops", HasPCRelativeMemops) .Case("spe", HasSPE) @@ -395,6 +386,7 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, .Case("direct-move", true) .Case("power8-vector", true) .Case("power9-vector", true) + .Case("paired-vector-memops", true) .Case("power10-vector", true) .Case("float128", true) .Default(false); @@ -414,11 +406,13 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, if ((Name == "altivec") || (Name == "vsx")) Features["vsx"] = Features["direct-move"] = Features["power8-vector"] = Features["float128"] = Features["power9-vector"] = - Features["power10-vector"] = false; + Features["paired-vector-memops"] = Features["power10-vector"] = + false; if (Name == "power8-vector") - Features["power9-vector"] = Features["power10-vector"] = false; + Features["power9-vector"] = Features["paired-vector-memops"] = + Features["power10-vector"] = false; else if (Name == "power9-vector") - Features["power10-vector"] = false; + Features["paired-vector-memops"] = Features["power10-vector"] = false; if (Name == "pcrel") Features["pcrelative-memops"] = false; else @@ -503,17 +497,17 @@ ArrayRef PPCTargetInfo::getGCCAddlRegNames() const { } static constexpr llvm::StringLiteral ValidCPUNames[] = { - {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, - {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, - {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, - {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, - {"g5"}, {"a2"}, {"a2q"}, {"e500"}, {"e500mc"}, - {"e5500"}, {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, - {"power5"}, {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, - {"pwr6"}, {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, - {"power8"}, {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, - {"pwr10"}, {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, - {"powerpc64le"}, {"ppc64le"}, {"future"}}; + {"generic"}, {"440"}, {"450"}, {"601"}, {"602"}, + {"603"}, {"603e"}, {"603ev"}, {"604"}, {"604e"}, + {"620"}, {"630"}, {"g3"}, {"7400"}, {"g4"}, + {"7450"}, {"g4+"}, {"750"}, {"8548"}, {"970"}, + {"g5"}, {"a2"}, {"e500"}, {"e500mc"}, {"e5500"}, + {"power3"}, {"pwr3"}, {"power4"}, {"pwr4"}, {"power5"}, + {"pwr5"}, {"power5x"}, {"pwr5x"}, {"power6"}, {"pwr6"}, + {"power6x"}, {"pwr6x"}, {"power7"}, {"pwr7"}, {"power8"}, + {"pwr8"}, {"power9"}, {"pwr9"}, {"power10"}, {"pwr10"}, + {"powerpc"}, {"ppc"}, {"powerpc64"}, {"ppc64"}, {"powerpc64le"}, + {"ppc64le"}, {"future"}}; bool PPCTargetInfo::isValidCPUName(StringRef Name) const { return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 858059bacb86b..da4d37bbdcfc8 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -46,7 +46,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { ArchDefinePwr10 = 1 << 14, ArchDefineFuture = 1 << 15, ArchDefineA2 = 1 << 16, - ArchDefineA2q = 1 << 17, ArchDefineE500 = 1 << 18 } ArchDefineTypes; @@ -63,12 +62,12 @@ class 
LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool HasP8Vector = false; bool HasP8Crypto = false; bool HasDirectMove = false; - bool HasQPX = false; bool HasHTM = false; bool HasBPERMD = false; bool HasExtDiv = false; bool HasP9Vector = false; bool HasSPE = false; + bool PairedVectorMemops = false; bool HasP10Vector = false; bool HasPCRelativeMemops = false; @@ -118,7 +117,6 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Case("a2q", ArchDefineName | ArchDefineA2 | ArchDefineA2q) .Cases("power3", "pwr3", ArchDefinePpcgr) .Cases("power4", "pwr4", ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) @@ -371,13 +369,16 @@ class LLVM_LIBRARY_VISIBILITY PPC32TargetInfo : public PPCTargetInfo { PtrDiffType = SignedLong; IntPtrType = SignedLong; SuitableAlign = 64; + LongDoubleWidth = 64; + LongDoubleAlign = DoubleAlign = 32; + LongDoubleFormat = &llvm::APFloat::IEEEdouble(); break; default: break; } if (Triple.isOSFreeBSD() || Triple.isOSNetBSD() || Triple.isOSOpenBSD() || - Triple.getOS() == llvm::Triple::AIX || Triple.isMusl()) { + Triple.isMusl()) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } @@ -406,6 +407,9 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { // TODO: Set appropriate ABI for AIX platform. resetDataLayout("E-m:a-i64:64-n32:64"); SuitableAlign = 64; + LongDoubleWidth = 64; + LongDoubleAlign = DoubleAlign = 32; + LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } else if ((Triple.getArch() == llvm::Triple::ppc64le)) { resetDataLayout("e-m:e-i64:64-n32:64"); ABI = "elfv2"; @@ -414,8 +418,7 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { ABI = "elfv1"; } - if (Triple.isOSFreeBSD() || Triple.getOS() == llvm::Triple::AIX || - Triple.isMusl()) { + if (Triple.isOSFreeBSD() || Triple.isMusl()) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index aaace69a65b9b..a6f8ded11c0a4 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -351,7 +351,8 @@ static void addDataFlowSanitizerPass(const PassManagerBuilder &Builder, const PassManagerBuilderWrapper &BuilderWrapper = static_cast(Builder); const LangOptions &LangOpts = BuilderWrapper.getLangOpts(); - PM.add(createDataFlowSanitizerPass(LangOpts.SanitizerBlacklistFiles)); + PM.add( + createDataFlowSanitizerLegacyPassPass(LangOpts.SanitizerBlacklistFiles)); } static TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple, diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4800c9adcd38d..3c29b7f62b920 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5257,6 +5257,8 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), + NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, 
AddRetType | Add1ArgType), @@ -5274,6 +5276,8 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), + NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0), NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType), @@ -5432,6 +5436,10 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), @@ -9001,21 +9009,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy); return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr); } - case NEON::BI__builtin_neon_vcvts_u32_f32: - case NEON::BI__builtin_neon_vcvtd_u64_f64: - usgn = true; - LLVM_FALLTHROUGH; - case NEON::BI__builtin_neon_vcvts_s32_f32: - case NEON::BI__builtin_neon_vcvtd_s64_f64: { - Ops.push_back(EmitScalarExpr(E->getArg(0))); - bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64; - llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty; - llvm::Type *FTy = Is64 ? 
DoubleTy : FloatTy; - Ops[0] = Builder.CreateBitCast(Ops[0], FTy); - if (usgn) - return Builder.CreateFPToUI(Ops[0], InTy); - return Builder.CreateFPToSI(Ops[0], InTy); - } case NEON::BI__builtin_neon_vcvts_f32_u32: case NEON::BI__builtin_neon_vcvtd_f64_u64: usgn = true; @@ -9053,44 +9046,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateUIToFP(Ops[0], FTy); return Builder.CreateSIToFP(Ops[0], FTy); } - case NEON::BI__builtin_neon_vcvth_u16_f16: - usgn = true; - LLVM_FALLTHROUGH; - case NEON::BI__builtin_neon_vcvth_s16_f16: { - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); - if (usgn) - return Builder.CreateFPToUI(Ops[0], Int16Ty); - return Builder.CreateFPToSI(Ops[0], Int16Ty); - } - case NEON::BI__builtin_neon_vcvth_u32_f16: - usgn = true; - LLVM_FALLTHROUGH; - case NEON::BI__builtin_neon_vcvth_s32_f16: { - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); - if (usgn) - return Builder.CreateFPToUI(Ops[0], Int32Ty); - return Builder.CreateFPToSI(Ops[0], Int32Ty); - } - case NEON::BI__builtin_neon_vcvth_u64_f16: - usgn = true; - LLVM_FALLTHROUGH; - case NEON::BI__builtin_neon_vcvth_s64_f16: { - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); - if (usgn) - return Builder.CreateFPToUI(Ops[0], Int64Ty); - return Builder.CreateFPToSI(Ops[0], Int64Ty); - } case NEON::BI__builtin_neon_vcvtah_u16_f16: case NEON::BI__builtin_neon_vcvtmh_u16_f16: case NEON::BI__builtin_neon_vcvtnh_u16_f16: case NEON::BI__builtin_neon_vcvtph_u16_f16: + case NEON::BI__builtin_neon_vcvth_u16_f16: case NEON::BI__builtin_neon_vcvtah_s16_f16: case NEON::BI__builtin_neon_vcvtmh_s16_f16: case NEON::BI__builtin_neon_vcvtnh_s16_f16: - case NEON::BI__builtin_neon_vcvtph_s16_f16: { + case NEON::BI__builtin_neon_vcvtph_s16_f16: + case NEON::BI__builtin_neon_vcvth_s16_f16: { unsigned Int; llvm::Type* InTy = Int32Ty; llvm::Type* FTy = HalfTy; @@ -9106,6 +9071,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fcvtnu; break; case NEON::BI__builtin_neon_vcvtph_u16_f16: Int = Intrinsic::aarch64_neon_fcvtpu; break; + case NEON::BI__builtin_neon_vcvth_u16_f16: + Int = Intrinsic::aarch64_neon_fcvtzu; break; case NEON::BI__builtin_neon_vcvtah_s16_f16: Int = Intrinsic::aarch64_neon_fcvtas; break; case NEON::BI__builtin_neon_vcvtmh_s16_f16: @@ -9114,6 +9081,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fcvtns; break; case NEON::BI__builtin_neon_vcvtph_s16_f16: Int = Intrinsic::aarch64_neon_fcvtps; break; + case NEON::BI__builtin_neon_vcvth_s16_f16: + Int = Intrinsic::aarch64_neon_fcvtzs; break; } Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt"); return Builder.CreateTrunc(Ops[0], Int16Ty); @@ -10154,10 +10123,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcvtq_u64_v: case NEON::BI__builtin_neon_vcvtq_s16_v: case NEON::BI__builtin_neon_vcvtq_u16_v: { - Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); - if (usgn) - return Builder.CreateFPToUI(Ops[0], Ty); - return Builder.CreateFPToSI(Ops[0], Ty); + Int = + usgn ? 
Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs; + llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)}; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz"); } case NEON::BI__builtin_neon_vcvta_s16_v: case NEON::BI__builtin_neon_vcvta_u16_v: @@ -16534,39 +16503,15 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()}); return Builder.CreateCall(Callee, {Low, High}); } - case WebAssembly::BI__builtin_wasm_widen_low_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i16x8: - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i16x8: { - Value *Vec = EmitScalarExpr(E->getArg(0)); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_widen_low_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_low_signed; - break; - case WebAssembly::BI__builtin_wasm_widen_high_s_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_high_signed; - break; - case WebAssembly::BI__builtin_wasm_widen_low_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_low_unsigned; - break; - case WebAssembly::BI__builtin_wasm_widen_high_u_i16x8_i8x16: - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i16x8: - IntNo = Intrinsic::wasm_widen_high_unsigned; - break; - default: - llvm_unreachable("unexpected builtin ID"); - } - Function *Callee = - CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()}); - return Builder.CreateCall(Callee, Vec); + case WebAssembly::BI__builtin_wasm_load32_zero: { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero); + return Builder.CreateCall(Callee, {Ptr}); + } + case WebAssembly::BI__builtin_wasm_load64_zero: { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero); + return Builder.CreateCall(Callee, {Ptr}); } case WebAssembly::BI__builtin_wasm_shuffle_v8x16: { Value *Ops[18]; diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index a62829cee5323..4c8216b2852fa 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2303,12 +2303,19 @@ static bool shouldOmitDefinition(codegenoptions::DebugInfoKind DebugKind, // In constructor debug mode, only emit debug info for a class when its // constructor is emitted. Skip this optimization if the class or any of // its methods are marked dllimport. + // + // This applies to classes that don't have any trivial constructors and have + // at least one constructor. 
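// [Editor's illustration, not part of the patch] The rewritten check below
// omits a class definition only when the class has at least one constructor
// and none of its non-copy/move constructors is trivial; two hypothetical
// classes show the two outcomes:
struct HomedClass {
  HomedClass(); // user-provided ctor: definition deferred until it is emitted
};
struct EagerAggregate {
  int Field;    // only trivial ctors: debug info emitted eagerly, as before
};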
if (DebugKind == codegenoptions::DebugInfoConstructor && !CXXDecl->isLambda() && !CXXDecl->hasConstexprNonCopyMoveConstructor() && - !isClassOrMethodDLLImport(CXXDecl)) + !isClassOrMethodDLLImport(CXXDecl)) { + if (CXXDecl->ctors().empty()) + return false; for (const auto *Ctor : CXXDecl->ctors()) - if (Ctor->isUserProvided()) - return true; + if (Ctor->isTrivial() && !Ctor->isCopyOrMoveConstructor()) + return false; + return true; + } TemplateSpecializationKind Spec = TSK_Undeclared; if (const auto *SD = dyn_cast<ClassTemplateSpecializationDecl>(RD)) @@ -3259,7 +3266,6 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { case Type::TypeOf: case Type::Decltype: case Type::UnaryTransform: - case Type::PackExpansion: break; } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 96abefd384f2c..4a29dbf7f8953 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -126,8 +126,13 @@ Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty, } void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) { - assert(isa<llvm::AllocaInst>(Var.getPointer())); - auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false, + auto *Alloca = Var.getPointer(); + assert(isa<llvm::AllocaInst>(Alloca) || + (isa<llvm::AddrSpaceCastInst>(Alloca) && + isa<llvm::AllocaInst>( + cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand()))); + + auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false, Var.getAlignment().getAsAlign()); llvm::BasicBlock *Block = AllocaInsertPt->getParent(); Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store); diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index d59aa6ce0fb94..d0e0c7d6c0603 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1788,11 +1788,14 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, DeleteArgs.add(RValue::get(DeletePtr), ArgTy); // Pass the std::destroying_delete tag if present. + llvm::AllocaInst *DestroyingDeleteTag = nullptr; if (Params.DestroyingDelete) { QualType DDTag = *ParamTypeIt++; - // Just pass an 'undef'. We expect the tag type to be an empty struct. - auto *V = llvm::UndefValue::get(getTypes().ConvertType(DDTag)); - DeleteArgs.add(RValue::get(V), DDTag); + llvm::Type *Ty = getTypes().ConvertType(DDTag); + CharUnits Align = CGM.getNaturalTypeAlignment(DDTag); + DestroyingDeleteTag = CreateTempAlloca(Ty, "destroying.delete.tag"); + DestroyingDeleteTag->setAlignment(Align.getAsAlign()); + DeleteArgs.add(RValue::getAggregate(Address(DestroyingDeleteTag, Align)), DDTag); } // Pass the size if the delete function has a size_t parameter. @@ -1829,6 +1832,11 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, // Emit the call to delete. EmitNewDeleteCall(*this, DeleteFD, DeleteFTy, DeleteArgs); + + // If call argument lowering didn't use the destroying_delete_t alloca, + // remove it again. + if (DestroyingDeleteTag && DestroyingDeleteTag->use_empty()) + DestroyingDeleteTag->eraseFromParent(); } namespace { diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index cd2b84f5dd203..26dfb6259a290 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -2250,8 +2250,7 @@ llvm::Value * CodeGenFunction::EmitARCRetainAutoreleasedReturnValue(llvm::Value *value) { emitAutoreleasedReturnValueMarker(*this); llvm::CallInst::TailCallKind tailKind = - CGM.getTargetCodeGenInfo() - .shouldSuppressTailCallsOfRetainAutoreleasedReturnValue() + CGM.getTargetCodeGenInfo().markARCOptimizedReturnCallsAsNoTail() ? 
llvm::CallInst::TCK_NoTail : llvm::CallInst::TCK_None; return emitARCValueOperation( @@ -2270,9 +2269,14 @@ CodeGenFunction::EmitARCRetainAutoreleasedReturnValue(llvm::Value *value) { llvm::Value * CodeGenFunction::EmitARCUnsafeClaimAutoreleasedReturnValue(llvm::Value *value) { emitAutoreleasedReturnValueMarker(*this); - return emitARCValueOperation(*this, value, nullptr, - CGM.getObjCEntrypoints().objc_unsafeClaimAutoreleasedReturnValue, - llvm::Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); + llvm::CallInst::TailCallKind tailKind = + CGM.getTargetCodeGenInfo().markARCOptimizedReturnCallsAsNoTail() + ? llvm::CallInst::TCK_NoTail + : llvm::CallInst::TCK_None; + return emitARCValueOperation( + *this, value, nullptr, + CGM.getObjCEntrypoints().objc_unsafeClaimAutoreleasedReturnValue, + llvm::Intrinsic::objc_unsafeClaimAutoreleasedReturnValue, tailKind); } /// Release the given object. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index f16f2ed89de89..60c7081b135bf 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1455,6 +1455,19 @@ void CGOpenMPRuntime::clearLocThreadIdInsertPt(CodeGenFunction &CGF) { } } +static StringRef getIdentStringFromSourceLocation(CodeGenFunction &CGF, + SourceLocation Loc, + SmallString<128> &Buffer) { + llvm::raw_svector_ostream OS(Buffer); + // Build debug location + PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); + OS << ";" << PLoc.getFilename() << ";"; + if (const auto *FD = dyn_cast_or_null(CGF.CurFuncDecl)) + OS << FD->getQualifiedNameAsString(); + OS << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;"; + return OS.str(); +} + llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags) { @@ -1464,6 +1477,16 @@ llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, Loc.isInvalid()) return getOrCreateDefaultLocation(Flags).getPointer(); + // If the OpenMPIRBuilder is used we need to use it for all location handling + // as the clang invariants used below might be broken. 
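// [Editor's sketch] getIdentStringFromSourceLocation above produces the
// ";file;function;line;column;;" form used for OpenMP ident_t source
// locations. A standalone restatement with the standard library
// (hypothetical function name):
#include <sstream>
#include <string>
std::string buildIdentString(const std::string &File, const std::string &Func,
                             unsigned Line, unsigned Col) {
  std::ostringstream OS;
  OS << ";" << File << ";" << Func << ";" << Line << ";" << Col << ";;";
  return OS.str(); // e.g. ";test.c;foo;3;9;;"
}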
+ if (CGM.getLangOpts().OpenMPIRBuilder) { + SmallString<128> Buffer; + OMPBuilder.updateToLocation(CGF.Builder.saveIP()); + auto *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr( + getIdentStringFromSourceLocation(CGF, Loc, Buffer)); + return OMPBuilder.getOrCreateIdent(SrcLocStr, IdentFlag(Flags)); + } + assert(CGF.CurFn && "No function in current CodeGenFunction."); CharUnits Align = CGM.getContext().getTypeAlignInChars(IdentQTy); @@ -1497,15 +1520,9 @@ llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, llvm::Value *OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding()); if (OMPDebugLoc == nullptr) { - SmallString<128> Buffer2; - llvm::raw_svector_ostream OS2(Buffer2); - // Build debug location - PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); - OS2 << ";" << PLoc.getFilename() << ";"; - if (const auto *FD = dyn_cast_or_null(CGF.CurFuncDecl)) - OS2 << FD->getQualifiedNameAsString(); - OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;"; - OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str()); + SmallString<128> Buffer; + OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr( + getIdentStringFromSourceLocation(CGF, Loc, Buffer)); OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc; } // *psource = ";;;;;;"; @@ -1519,6 +1536,16 @@ llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF, llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, SourceLocation Loc) { assert(CGF.CurFn && "No function in current CodeGenFunction."); + // If the OpenMPIRBuilder is used we need to use it for all thread id calls as + // the clang invariants used below might be broken. + if (CGM.getLangOpts().OpenMPIRBuilder) { + SmallString<128> Buffer; + OMPBuilder.updateToLocation(CGF.Builder.saveIP()); + auto *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr( + getIdentStringFromSourceLocation(CGF, Loc, Buffer)); + return OMPBuilder.getOrCreateThreadID( + OMPBuilder.getOrCreateIdent(SrcLocStr)); + } llvm::Value *ThreadID = nullptr; // Check whether we've already cached a load of the thread id in this @@ -7122,6 +7149,7 @@ class MappableExprsHandler { OMPClauseMappableExprCommon::MappableExprComponentListRef Components; OpenMPMapClauseKind MapType = OMPC_MAP_unknown; ArrayRef MapModifiers; + ArrayRef MotionModifiers; bool ReturnDevicePointer = false; bool IsImplicit = false; const ValueDecl *Mapper = nullptr; @@ -7131,10 +7159,12 @@ class MappableExprsHandler { MapInfo( OMPClauseMappableExprCommon::MappableExprComponentListRef Components, OpenMPMapClauseKind MapType, - ArrayRef MapModifiers, bool ReturnDevicePointer, - bool IsImplicit, const ValueDecl *Mapper = nullptr, - bool ForDeviceAddr = false) + ArrayRef MapModifiers, + ArrayRef MotionModifiers, + bool ReturnDevicePointer, bool IsImplicit, + const ValueDecl *Mapper = nullptr, bool ForDeviceAddr = false) : Components(Components), MapType(MapType), MapModifiers(MapModifiers), + MotionModifiers(MotionModifiers), ReturnDevicePointer(ReturnDevicePointer), IsImplicit(IsImplicit), Mapper(Mapper), ForDeviceAddr(ForDeviceAddr) {} }; @@ -7254,7 +7284,8 @@ class MappableExprsHandler { /// expression. OpenMPOffloadMappingFlags getMapTypeBits( OpenMPMapClauseKind MapType, ArrayRef MapModifiers, - bool IsImplicit, bool AddPtrFlag, bool AddIsTargetParamFlag) const { + ArrayRef MotionModifiers, bool IsImplicit, + bool AddPtrFlag, bool AddIsTargetParamFlag) const { OpenMPOffloadMappingFlags Bits = IsImplicit ? 
OMP_MAP_IMPLICIT : OMP_MAP_NONE; switch (MapType) { @@ -7293,6 +7324,9 @@ class MappableExprsHandler { if (llvm::find(MapModifiers, OMPC_MAP_MODIFIER_present) != MapModifiers.end()) Bits |= OMP_MAP_PRESENT; + if (llvm::find(MotionModifiers, OMPC_MOTION_MODIFIER_present) + != MotionModifiers.end()) + Bits |= OMP_MAP_PRESENT; return Bits; } @@ -7337,11 +7371,12 @@ class MappableExprsHandler { /// Generate the base pointers, section pointers, sizes, map type bits, and /// user-defined mappers (all included in \a CombinedInfo) for the provided - /// map type, map modifier, and expression components. \a IsFirstComponent - /// should be set to true if the provided set of components is the first - /// associated with a capture. + /// map type, map or motion modifiers, and expression components. + /// \a IsFirstComponent should be set to true if the provided set of + /// components is the first associated with a capture. void generateInfoForComponentList( OpenMPMapClauseKind MapType, ArrayRef MapModifiers, + ArrayRef MotionModifiers, OMPClauseMappableExprCommon::MappableExprComponentListRef Components, MapCombinedInfoTy &CombinedInfo, StructRangeInfoTy &PartialStruct, bool IsFirstComponentList, bool IsImplicit, @@ -7384,10 +7419,9 @@ class MappableExprsHandler { // &p, &p, sizeof(float*), TARGET_PARAM | TO | FROM // // map(p[1:24]) + // &p, &p[1], 24*sizeof(float), TARGET_PARAM | TO | FROM | PTR_AND_OBJ + // in unified shared memory mode or for local pointers // p, &p[1], 24*sizeof(float), TARGET_PARAM | TO | FROM - // for data directives - // p, p, sizeof(float*), TARGET_PARAM | TO | FROM - // p, &p[1], 24*sizeof(float), PTR_AND_OBJ | TO | FROM // // map(s) // &s, &s, sizeof(S2), TARGET_PARAM | TO | FROM @@ -7522,6 +7556,7 @@ class MappableExprsHandler { // Track if the map information being generated is the first for a list of // components. bool IsExpressionFirstInfo = true; + bool FirstPointerInComplexData = false; Address BP = Address::invalid(); const Expr *AssocExpr = I->getAssociatedExpression(); const auto *AE = dyn_cast(AssocExpr); @@ -7564,17 +7599,16 @@ class MappableExprsHandler { QualType Ty = I->getAssociatedDeclaration()->getType().getNonReferenceType(); if (Ty->isAnyPointerType() && std::next(I) != CE) { - BP = CGF.EmitLoadOfPointer(BP, Ty->castAs()); - - // For non-data directives, we do not need to generate individual map - // information for the pointer, it can be associated with the combined - // storage. + // No need to generate individual map information for the pointer, it + // can be associated with the combined storage if shared memory mode is + // active or the base declaration is not global variable. + const auto *VD = dyn_cast(I->getAssociatedDeclaration()); if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || - !CurDir.is() || - !isOpenMPTargetDataManagementDirective( - CurDir.get() - ->getDirectiveKind())) - ++I; + !VD || VD->hasLocalStorage()) + BP = CGF.EmitLoadOfPointer(BP, Ty->castAs()); + else + FirstPointerInComplexData = true; + ++I; } } @@ -7609,8 +7643,19 @@ class MappableExprsHandler { EncounteredME = dyn_cast(I->getAssociatedExpression()); // If we encounter a PTR_AND_OBJ entry from now on it should be marked // as MEMBER_OF the parent struct. - if (EncounteredME) + if (EncounteredME) { ShouldBeMemberOf = true; + // Do not emit as complex pointer if this is actually not array-like + // expression. 
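// [Editor's illustration, hedged] Per the updated mapping comment above,
// whether the base pointer of a section gets its own PTR_AND_OBJ entry now
// depends on where the pointer lives rather than on the directive kind; a
// source-level sketch (compile with -fopenmp and an offloading target):
float *GlobalPtr; // global base: kept as its own entry, pointee chained to it
void useMappedPointers(float *LocalPtr) {
#pragma omp target map(GlobalPtr[1 : 24]) map(LocalPtr[1 : 24])
  {
    GlobalPtr[1] = 1.0f; // LocalPtr's base was folded into the combined entry
  }
}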
+ if (FirstPointerInComplexData) { + QualType Ty = std::prev(I) + ->getAssociatedDeclaration() + ->getType() + .getNonReferenceType(); + BP = CGF.EmitLoadOfPointer(BP, Ty->castAs()); + FirstPointerInComplexData = false; + } + } } auto Next = std::next(I); @@ -7694,7 +7739,7 @@ class MappableExprsHandler { // Emit data for non-overlapped data. OpenMPOffloadMappingFlags Flags = OMP_MAP_MEMBER_OF | - getMapTypeBits(MapType, MapModifiers, IsImplicit, + getMapTypeBits(MapType, MapModifiers, MotionModifiers, IsImplicit, /*AddPtrFlag=*/false, /*AddIsTargetParamFlag=*/false); LB = BP; @@ -7750,10 +7795,11 @@ class MappableExprsHandler { // same expression except for the first one. We also need to signal // this map is the first one that relates with the current capture // (there is a set of entries for each capture). - OpenMPOffloadMappingFlags Flags = getMapTypeBits( - MapType, MapModifiers, IsImplicit, - !IsExpressionFirstInfo || RequiresReference, - IsCaptureFirstInfo && !RequiresReference); + OpenMPOffloadMappingFlags Flags = + getMapTypeBits(MapType, MapModifiers, MotionModifiers, IsImplicit, + !IsExpressionFirstInfo || RequiresReference || + FirstPointerInComplexData, + IsCaptureFirstInfo && !RequiresReference); if (!IsExpressionFirstInfo) { // If we have a PTR_AND_OBJ pair where the OBJ is a pointer as well, @@ -7811,6 +7857,7 @@ class MappableExprsHandler { IsExpressionFirstInfo = false; IsCaptureFirstInfo = false; + FirstPointerInComplexData = false; } } } @@ -8017,14 +8064,16 @@ class MappableExprsHandler { OMPClauseMappableExprCommon::MappableExprComponentListRef L, OpenMPMapClauseKind MapType, ArrayRef MapModifiers, + ArrayRef MotionModifiers, bool ReturnDevicePointer, bool IsImplicit, const ValueDecl *Mapper, bool ForDeviceAddr = false) { const ValueDecl *VD = D ? cast(D->getCanonicalDecl()) : nullptr; if (SkipVarSet.count(VD)) return; - Info[VD].emplace_back(L, MapType, MapModifiers, ReturnDevicePointer, - IsImplicit, Mapper, ForDeviceAddr); + Info[VD].emplace_back(L, MapType, MapModifiers, MotionModifiers, + ReturnDevicePointer, IsImplicit, Mapper, + ForDeviceAddr); }; assert(CurDir.is() && @@ -8033,18 +8082,20 @@ class MappableExprsHandler { for (const auto *C : CurExecDir->getClausesOfKind()) for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), std::get<1>(L), C->getMapType(), - C->getMapTypeModifiers(), /*ReturnDevicePointer=*/false, - C->isImplicit(), std::get<2>(L)); + C->getMapTypeModifiers(), llvm::None, + /*ReturnDevicePointer=*/false, C->isImplicit(), std::get<2>(L)); } for (const auto *C : CurExecDir->getClausesOfKind()) for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), std::get<1>(L), OMPC_MAP_to, llvm::None, - /*ReturnDevicePointer=*/false, C->isImplicit(), std::get<2>(L)); + C->getMotionModifiers(), /*ReturnDevicePointer=*/false, + C->isImplicit(), std::get<2>(L)); } for (const auto *C : CurExecDir->getClausesOfKind()) for (const auto L : C->component_lists()) { InfoGen(std::get<0>(L), std::get<1>(L), OMPC_MAP_from, llvm::None, - /*ReturnDevicePointer=*/false, C->isImplicit(), std::get<2>(L)); + C->getMotionModifiers(), /*ReturnDevicePointer=*/false, + C->isImplicit(), std::get<2>(L)); } // Look at the use_device_ptr clause information and mark the existing map @@ -8055,6 +8106,7 @@ class MappableExprsHandler { // emission of that entry until the whole struct has been processed. 
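// [Editor's illustration, hedged] The use_device_ptr handling below decides
// between flagging an existing map entry as RETURN_PARAM and deferring a
// separate entry; at the source level the clause simply privatizes the
// pointer to hold the device address inside the region:
void useDeviceAddress(float *P) {
#pragma omp target data map(P[0 : 64]) use_device_ptr(P)
  {
    // Here P holds the corresponding device address of the mapped storage.
  }
}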
llvm::MapVector> DeferredInfo; + MapCombinedInfoTy UseDevicePtrCombinedInfo; for (const auto *C : CurExecDir->getClausesOfKind()) { @@ -8074,15 +8126,27 @@ class MappableExprsHandler { // We potentially have map information for this declaration already. // Look for the first set of components that refer to it. if (It != Info.end()) { - auto CI = std::find_if( - It->second.begin(), It->second.end(), [VD](const MapInfo &MI) { - return MI.Components.back().getAssociatedDeclaration() == VD; - }); + auto *CI = llvm::find_if(It->second, [VD](const MapInfo &MI) { + return MI.Components.back().getAssociatedDeclaration() == VD; + }); // If we found a map entry, signal that the pointer has to be returned // and move on to the next declaration. + // Exclude cases where the base pointer is mapped as array subscript, + // array section or array shaping. The base address is passed as a + // pointer to base in this case and cannot be used as a base for + // use_device_ptr list item. if (CI != It->second.end()) { - CI->ReturnDevicePointer = true; - continue; + auto PrevCI = std::next(CI->Components.rbegin()); + const auto *VarD = dyn_cast(VD); + if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || + isa(IE) || + !VD->getType().getNonReferenceType()->isPointerType() || + PrevCI == CI->Components.rend() || + isa(PrevCI->getAssociatedExpression()) || !VarD || + VarD->hasLocalStorage()) { + CI->ReturnDevicePointer = true; + continue; + } } } @@ -8097,19 +8161,19 @@ class MappableExprsHandler { // Nonetheless, generateInfoForComponentList must be called to take // the pointer into account for the calculation of the range of the // partial struct. - InfoGen(nullptr, Components, OMPC_MAP_unknown, llvm::None, + InfoGen(nullptr, Components, OMPC_MAP_unknown, llvm::None, llvm::None, /*ReturnDevicePointer=*/false, C->isImplicit(), nullptr); DeferredInfo[nullptr].emplace_back(IE, VD, /*ForDeviceAddr=*/false); } else { llvm::Value *Ptr = CGF.EmitLoadOfScalar(CGF.EmitLValue(IE), IE->getExprLoc()); - CombinedInfo.BasePointers.emplace_back(Ptr, VD); - CombinedInfo.Pointers.push_back(Ptr); - CombinedInfo.Sizes.push_back( + UseDevicePtrCombinedInfo.BasePointers.emplace_back(Ptr, VD); + UseDevicePtrCombinedInfo.Pointers.push_back(Ptr); + UseDevicePtrCombinedInfo.Sizes.push_back( llvm::Constant::getNullValue(CGF.Int64Ty)); - CombinedInfo.Types.push_back(OMP_MAP_RETURN_PARAM | - OMP_MAP_TARGET_PARAM); - CombinedInfo.Mappers.push_back(nullptr); + UseDevicePtrCombinedInfo.Types.push_back(OMP_MAP_RETURN_PARAM | + OMP_MAP_TARGET_PARAM); + UseDevicePtrCombinedInfo.Mappers.push_back(nullptr); } } } @@ -8162,8 +8226,8 @@ class MappableExprsHandler { // the pointer into account for the calculation of the range of the // partial struct. InfoGen(nullptr, std::get<1>(L), OMPC_MAP_unknown, llvm::None, - /*ReturnDevicePointer=*/false, C->isImplicit(), nullptr, - /*ForDeviceAddr=*/true); + llvm::None, /*ReturnDevicePointer=*/false, C->isImplicit(), + nullptr, /*ForDeviceAddr=*/true); DeferredInfo[nullptr].emplace_back(IE, VD, /*ForDeviceAddr=*/true); } else { llvm::Value *Ptr; @@ -8195,9 +8259,10 @@ class MappableExprsHandler { // Remember the current base pointer index. 
unsigned CurrentBasePointersIdx = CurInfo.BasePointers.size(); - generateInfoForComponentList( - L.MapType, L.MapModifiers, L.Components, CurInfo, PartialStruct, - IsFirstComponentList, L.IsImplicit, L.Mapper, L.ForDeviceAddr); + generateInfoForComponentList(L.MapType, L.MapModifiers, + L.MotionModifiers, L.Components, CurInfo, + PartialStruct, IsFirstComponentList, + L.IsImplicit, L.Mapper, L.ForDeviceAddr); // If this entry relates with a device pointer, set the relevant // declaration and add the 'return pointer' flag. @@ -8260,6 +8325,8 @@ class MappableExprsHandler { // We need to append the results of this capture to what we already have. CombinedInfo.append(CurInfo); } + // Append data for use_device_ptr clauses. + CombinedInfo.append(UseDevicePtrCombinedInfo); } /// Generate all the base pointers, section pointers, sizes, map types, and @@ -8282,9 +8349,10 @@ class MappableExprsHandler { std::get<0>(L) ? cast(std::get<0>(L)->getCanonicalDecl()) : nullptr; // Get the corresponding user-defined mapper. - Info[VD].emplace_back( - std::get<1>(L), MC->getMapType(), MC->getMapTypeModifiers(), - /*ReturnDevicePointer=*/false, MC->isImplicit(), std::get<2>(L)); + Info[VD].emplace_back(std::get<1>(L), MC->getMapType(), + MC->getMapTypeModifiers(), llvm::None, + /*ReturnDevicePointer=*/false, MC->isImplicit(), + std::get<2>(L)); } } @@ -8300,9 +8368,10 @@ class MappableExprsHandler { for (const MapInfo &L : M.second) { assert(!L.Components.empty() && "Not expecting declaration with no component lists."); - generateInfoForComponentList( - L.MapType, L.MapModifiers, L.Components, CurInfo, PartialStruct, - IsFirstComponentList, L.IsImplicit, L.Mapper, L.ForDeviceAddr); + generateInfoForComponentList(L.MapType, L.MapModifiers, + L.MotionModifiers, L.Components, CurInfo, + PartialStruct, IsFirstComponentList, + L.IsImplicit, L.Mapper, L.ForDeviceAddr); IsFirstComponentList = false; } @@ -8568,9 +8637,9 @@ class MappableExprsHandler { OverlappedComponents = Pair.getSecond(); bool IsFirstComponentList = true; generateInfoForComponentList( - MapType, MapModifiers, Components, CombinedInfo, PartialStruct, - IsFirstComponentList, IsImplicit, Mapper, /*ForDeviceAddr=*/false, - OverlappedComponents); + MapType, MapModifiers, llvm::None, Components, CombinedInfo, + PartialStruct, IsFirstComponentList, IsImplicit, Mapper, + /*ForDeviceAddr=*/false, OverlappedComponents); } // Go through other elements without overlapped elements. bool IsFirstComponentList = OverlappedData.empty(); @@ -8583,8 +8652,8 @@ class MappableExprsHandler { std::tie(Components, MapType, MapModifiers, IsImplicit, Mapper) = L; auto It = OverlappedData.find(&L); if (It == OverlappedData.end()) - generateInfoForComponentList(MapType, MapModifiers, Components, - CombinedInfo, PartialStruct, + generateInfoForComponentList(MapType, MapModifiers, llvm::None, + Components, CombinedInfo, PartialStruct, IsFirstComponentList, IsImplicit, Mapper); IsFirstComponentList = false; } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp new file mode 100644 index 0000000000000..ccffdf43549fe --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -0,0 +1,61 @@ +//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides a class for OpenMP runtime code generation specialized to +// AMDGCN targets from generalized CGOpenMPRuntimeGPU class. +// +//===----------------------------------------------------------------------===// + +#include "CGOpenMPRuntimeAMDGCN.h" +#include "CGOpenMPRuntimeGPU.h" +#include "CodeGenFunction.h" +#include "clang/AST/Attr.h" +#include "clang/AST/DeclOpenMP.h" +#include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtVisitor.h" +#include "clang/Basic/Cuda.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace clang; +using namespace CodeGen; +using namespace llvm::omp; + +CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM) + : CGOpenMPRuntimeGPU(CGM) { + if (!CGM.getLangOpts().OpenMPIsDevice) + llvm_unreachable("OpenMP AMDGCN can only handle device code."); +} + +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + // return constant compile-time target-specific warp size + unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + return Bld.getInt32(WarpSize); +} + +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F = + CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x); + return Bld.CreateCall(F, llvm::None, "nvptx_tid"); +} + +llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Module *M = &CGF.CGM.getModule(); + const char *LocSize = "__ockl_get_local_size"; + llvm::Function *F = M->getFunction(LocSize); + if (!F) { + F = llvm::Function::Create( + llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false), + llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); + } + return Bld.CreateTrunc( + Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h new file mode 100644 index 0000000000000..c1421261bfc19 --- /dev/null +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h @@ -0,0 +1,43 @@ +//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides a class for OpenMP runtime code generation specialized to +// AMDGCN targets from generalized CGOpenMPRuntimeGPU class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H +#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H + +#include "CGOpenMPRuntime.h" +#include "CGOpenMPRuntimeGPU.h" +#include "CodeGenFunction.h" +#include "clang/AST/StmtOpenMP.h" + +namespace clang { +namespace CodeGen { + +class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU { + +public: + explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM); + + /// Get the GPU warp size. + llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; + + /// Get the id of the current thread on the GPU. 
+ llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; + + /// Get the maximum number of threads in a block of the GPU. + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; +}; + +} // namespace CodeGen +} // namespace clang + +#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 1cd89c540f478..452eb15eb8d16 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This provides a generalized class for OpenMP runtime code generation -// specialized by GPU target NVPTX. +// specialized by GPU targets NVPTX and AMDGCN. // //===----------------------------------------------------------------------===// @@ -621,14 +621,6 @@ class CheckVarsEscapingDeclContext final }; } // anonymous namespace -/// Get the id of the current thread on the GPU. -static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), - "nvptx_tid"); -} - /// Get the id of the warp in the block. /// We assume that the warp size is 32, which is always the case /// on the NVPTX device, to generate more efficient code. @@ -636,7 +628,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; unsigned LaneIDBits = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); - return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); } /// Get the id of the current lane in the Warp. @@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( llvm::omp::GV_Warp_Size_Log2_Mask); - return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask), + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), "nvptx_lane_id"); } -/// Get the maximum number of threads in a block of the GPU. -static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { - return CGF.EmitRuntimeCall( - llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), - "nvptx_num_threads"); -} - /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF, CGBuilderTy &Bld = CGF.Builder; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return IsInSPMDExecutionMode - ? getNVPTXNumThreads(CGF) - : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF), - "thread_limit"); + ? RT.getGPUNumThreads(CGF) + : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF), + RT.getGPUWarpSize(CGF), "thread_limit"); } /// Get the thread id of the OMP master thread. @@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF, /// If NumThreads is 1024, master id is 992. 
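// [Editor's sketch] The arithmetic described above: the master thread is the
// first thread of the last warp, i.e. (NumThreads - 1) rounded down to a
// multiple of the warp size (assumed, as in the code, to be a power of two).
constexpr unsigned masterThreadID(unsigned NumThreads, unsigned WarpSize) {
  return (NumThreads - 1) & ~(WarpSize - 1);
}
static_assert(masterThreadID(1024, 32) == 992, "matches the example above");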
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - llvm::Value *NumThreads = getNVPTXNumThreads(CGF); auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + llvm::Value *NumThreads = RT.getGPUNumThreads(CGF); // We assume that the warp size is a power of 2. llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1)); @@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF, llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); EST.ExitBB = CGF.createBasicBlock(".exit"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); llvm::Value *IsWorker = - Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); + Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF)); Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); CGF.EmitBlock(WorkerBB); @@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF, CGF.EmitBlock(MasterCheckBB); llvm::Value *IsMaster = - Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF)); Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); CGF.EmitBlock(MasterBB); @@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion( llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); + // Get the mask of active threads in the warp. llvm::Value *Mask = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask)); // Fetch team-local id of the thread. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); + llvm::Value *ThreadID = RT.getGPUThreadID(CGF); // Get the width of the team. - llvm::Value *TeamWidth = getNVPTXNumThreads(CGF); + llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); // Initialize the counter variable for the loop. QualType Int32Ty = @@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, CGM.addCompilerUsedGlobal(TransferMedium); } + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); // Get the CUDA thread id of the current OpenMP thread on the GPU. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); + llvm::Value *ThreadID = RT.getGPUThreadID(CGF); // nvptx_lane_id = nvptx_id % warpsize llvm::Value *LaneID = getNVPTXLaneID(CGF); // nvptx_warp_id = nvptx_id / warpsize @@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { + auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { ScheduleKind = OMPC_DIST_SCHEDULE_static; - Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF), + Chunk = CGF.EmitScalarConversion( + RT.getGPUNumThreads(CGF), CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), S.getIterationVariable()->getType(), S.getBeginLoc()); return; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index 316333072c5bc..7267511ca672a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This provides a generalized class for OpenMP runtime code generation -// specialized by GPU target NVPTX. +// specialized by GPU targets NVPTX and AMDGCN. 
// //===----------------------------------------------------------------------===// @@ -199,9 +199,18 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { void clear() override; /// Declare generalized virtual functions which need to be defined - /// by all specializations of OpenMPGPURuntime Targets. + /// by all specializations of OpenMPGPURuntime Targets like AMDGCN + /// and NVPTX. + + /// Get the GPU warp size. virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0; + /// Get the id of the current thread on the GPU. + virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0; + + /// Get the maximum number of threads in a block of the GPU. + virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0; + /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. virtual void emitProcBindClause(CodeGenFunction &CGF, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 5fefc95ee4130..1688d07b90b6e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -32,10 +32,25 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) llvm_unreachable("OpenMP NVPTX can only handle device code."); } -/// Get the GPU warp size. llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) { return CGF.EmitRuntimeCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), "nvptx_warp_size"); } + +llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F; + F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); + return Bld.CreateCall(F, llvm::None, "nvptx_tid"); +} + +llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Function *F; + F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + return Bld.CreateCall(F, llvm::None, "nvptx_num_threads"); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index 6dab79e6e20ae..5f16029592665 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -22,11 +22,19 @@ namespace clang { namespace CodeGen { -class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU { +class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU { public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + + /// Get the GPU warp size. llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override; + + /// Get the id of the current thread on the GPU. + llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override; + + /// Get the maximum number of threads in a block of the GPU. + llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override; }; } // CodeGen namespace. 
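// [Editor's aside] A minimal sketch (invented names) of the refactoring the
// two headers above complete: thread/block queries move from free NVPTX
// helper functions into virtuals on the shared GPU runtime class, so each
// target answers with its own intrinsics. The wavefront width of 64 for GCN
// is a typical value, not taken from the patch.
struct GPURuntimeSketch {
  virtual ~GPURuntimeSketch() = default;
  virtual int warpSize() const = 0; // pure: every target must provide it
};
struct NVPTXSketch final : GPURuntimeSketch {
  int warpSize() const override { return 32; }
};
struct AMDGCNSketch final : GPURuntimeSketch {
  int warpSize() const override { return 64; }
};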
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 0ee1133ebaa16..df1cc1666de4e 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1707,9 +1707,11 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { CGCapturedStmtInfo CGSI(*CS, CR_OpenMP); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI); - Builder.restoreIP(OMPBuilder.CreateParallel(Builder, BodyGenCB, PrivCB, - FiniCB, IfCond, NumThreads, - ProcBind, S.hasCancel())); + llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( + AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); + Builder.restoreIP( + OMPBuilder.CreateParallel(Builder, AllocaIP, BodyGenCB, PrivCB, FiniCB, + IfCond, NumThreads, ProcBind, S.hasCancel())); return; } diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index a32addd7a898f..5213626b2e822 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -27,17 +27,6 @@ set(LLVM_LINK_COMPONENTS TransformUtils ) -if (MSVC) - set_source_files_properties(CodeGenModule.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() - -get_property(LLVMGenXIntrinsics_SOURCE_DIR GLOBAL PROPERTY LLVMGenXIntrinsics_SOURCE_PROP) -get_property(LLVMGenXIntrinsics_BINARY_DIR GLOBAL PROPERTY LLVMGenXIntrinsics_BINARY_PROP) - -include_directories( - ${LLVMGenXIntrinsics_SOURCE_DIR}/GenXIntrinsics/include - ${LLVMGenXIntrinsics_BINARY_DIR}/GenXIntrinsics/include) - add_clang_library(clangCodeGen BackendUtil.cpp CGAtomic.cpp @@ -70,6 +59,7 @@ add_clang_library(clangCodeGen CGObjCRuntime.cpp CGOpenCLRuntime.cpp CGOpenMPRuntime.cpp + CGOpenMPRuntimeAMDGCN.cpp CGOpenMPRuntimeGPU.cpp CGOpenMPRuntimeNVPTX.cpp CGRecordLayoutBuilder.cpp diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index bf5ecdd9d9fda..d1cc224dfcd76 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -1003,11 +1003,9 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { CoverageSourceInfo *CoverageInfo = nullptr; // Add the preprocessor callback only when the coverage mapping is generated. - if (CI.getCodeGenOpts().CoverageMapping) { - CoverageInfo = new CoverageSourceInfo; - CI.getPreprocessor().addPPCallbacks( - std::unique_ptr(CoverageInfo)); - } + if (CI.getCodeGenOpts().CoverageMapping) + CoverageInfo = CodeGen::CoverageMappingModuleGen::setUpCoverageCallbacks( + CI.getPreprocessor()); std::unique_ptr Result(new BackendConsumer( BA, CI.getDiagnostics(), CI.getHeaderSearchOpts(), diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 43ba631031afc..b0096411d6aa5 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2134,7 +2134,6 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::UnaryTransform: case Type::Attributed: case Type::SubstTemplateTypeParm: - case Type::PackExpansion: case Type::MacroQualified: // Keep walking after single level desugaring. 
type = type.getSingleStepDesugaredType(getContext()); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 43f0db10f49b9..28d27bc7cf762 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -19,6 +19,7 @@ #include "CGObjCRuntime.h" #include "CGOpenCLRuntime.h" #include "CGOpenMPRuntime.h" +#include "CGOpenMPRuntimeAMDGCN.h" #include "CGOpenMPRuntimeNVPTX.h" #include "CGSYCLRuntime.h" #include "CodeGenFunction.h" @@ -218,6 +219,11 @@ void CodeGenModule::createOpenMPRuntime() { "OpenMP NVPTX is only prepared to deal with device code."); OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this)); break; + case llvm::Triple::amdgcn: + assert(getLangOpts().OpenMPIsDevice && + "OpenMP AMDGCN is only prepared to deal with device code."); + OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this)); + break; default: if (LangOpts.OpenMPSimd) OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this)); diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 78b268f423cbf..9a7096b8d1d02 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -35,8 +35,35 @@ using namespace clang; using namespace CodeGen; using namespace llvm::coverage; +CoverageSourceInfo * +CoverageMappingModuleGen::setUpCoverageCallbacks(Preprocessor &PP) { + CoverageSourceInfo *CoverageInfo = new CoverageSourceInfo(); + PP.addPPCallbacks(std::unique_ptr<PPCallbacks>(CoverageInfo)); + PP.addCommentHandler(CoverageInfo); + PP.setPreprocessToken(true); + PP.setTokenWatcher([CoverageInfo](clang::Token Tok) { + // Update previous token location. + CoverageInfo->PrevTokLoc = Tok.getLocation(); + CoverageInfo->updateNextTokLoc(Tok.getLocation()); + }); + return CoverageInfo; +} + void CoverageSourceInfo::SourceRangeSkipped(SourceRange Range, SourceLocation) { - SkippedRanges.push_back(Range); + SkippedRanges.push_back({Range}); +} + +bool CoverageSourceInfo::HandleComment(Preprocessor &PP, SourceRange Range) { + SkippedRanges.push_back({Range, PrevTokLoc}); + AfterComment = true; + return false; +} + +void CoverageSourceInfo::updateNextTokLoc(SourceLocation Loc) { + if (AfterComment) { + SkippedRanges.back().NextTokLoc = Loc; + AfterComment = false; + } } namespace { @@ -274,8 +301,34 @@ class CoverageMappingBuilder { return None; } + /// This shrinks the skipped range if it spans a line that contains a + /// non-comment token. If shrinking the skipped range would make it empty, + /// this returns None. + Optional<SpellingRegion> adjustSkippedRange(SourceManager &SM, + SpellingRegion SR, + SourceLocation PrevTokLoc, + SourceLocation NextTokLoc) { + // If PrevTokLoc is invalid, the skipped range is not a comment region. + if (PrevTokLoc.isInvalid()) + return SR; + unsigned PrevTokLine = SM.getSpellingLineNumber(PrevTokLoc); + unsigned NextTokLine = SM.getSpellingLineNumber(NextTokLoc); + SpellingRegion newSR(SR); + if (SR.LineStart == PrevTokLine) { + newSR.LineStart = SR.LineStart + 1; + newSR.ColumnStart = 1; + } + if (SR.LineEnd == NextTokLine) { + newSR.LineEnd = SR.LineEnd - 1; + newSR.ColumnEnd = SR.ColumnStart + 1; + } + if (newSR.isInSourceOrder()) + return newSR; + return None; + } + /// Gather all the regions that were skipped by the preprocessor - /// using the constructs like #if. + /// using constructs like #if or comments. void gatherSkippedRegions() { /// An array of the minimum lineStarts and the maximum lineEnds /// for mapping regions from the appropriate source files.
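To make the line arithmetic in adjustSkippedRange concrete, here is a small worked example; the source lines and locations are invented for illustration:

// Hypothetical input (not from the commit):
//   3 | int x = f();   /* a comment that
//   4 |                   fills this whole line
//   5 |                   ends here */ int y = g();
// The skipped region SR spans lines 3-5. PrevTokLoc (the ';' on line 3)
// shares SR's first line, so LineStart becomes 4 and ColumnStart resets to 1;
// NextTokLoc (the 'int' on line 5) shares SR's last line, so LineEnd becomes
// 4. Only line 4, which is purely comment, is reported as skipped. If the two
// adjustments crossed (newSR no longer in source order), None would be
// returned and the region dropped entirely.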
@@ -291,9 +344,10 @@ class CoverageMappingBuilder { } auto SkippedRanges = CVM.getSourceInfo().getSkippedRanges(); - for (const auto &I : SkippedRanges) { - auto LocStart = I.getBegin(); - auto LocEnd = I.getEnd(); + for (auto &I : SkippedRanges) { + SourceRange Range = I.Range; + auto LocStart = Range.getBegin(); + auto LocEnd = Range.getEnd(); assert(SM.isWrittenInSameFile(LocStart, LocEnd) && "region spans multiple files"); @@ -301,6 +355,11 @@ if (!CovFileID) continue; SpellingRegion SR{SM, LocStart, LocEnd}; + if (Optional<SpellingRegion> res = + adjustSkippedRange(SM, SR, I.PrevTokLoc, I.NextTokLoc)) + SR = res.getValue(); + else + continue; auto Region = CounterMappingRegion::makeSkipped( *CovFileID, SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd); // Make sure that we only collect the regions that are inside diff --git a/clang/lib/CodeGen/CoverageMappingGen.h b/clang/lib/CodeGen/CoverageMappingGen.h index 5d79d1e656703..d3eec226d67ca 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.h +++ b/clang/lib/CodeGen/CoverageMappingGen.h @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/PPCallbacks.h" +#include "clang/Lex/Preprocessor.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/raw_ostream.h" @@ -29,15 +30,40 @@ class Preprocessor; class Decl; class Stmt; +struct SkippedRange { + SourceRange Range; + // The location of the token before the skipped source range. + SourceLocation PrevTokLoc; + // The location of the token after the skipped source range. + SourceLocation NextTokLoc; + + SkippedRange(SourceRange Range, SourceLocation PrevTokLoc = SourceLocation(), + SourceLocation NextTokLoc = SourceLocation()) + : Range(Range), PrevTokLoc(PrevTokLoc), NextTokLoc(NextTokLoc) {} +}; + /// Stores additional source code information like skipped ranges which /// is required by the coverage mapping generator and is obtained from /// the preprocessor. -class CoverageSourceInfo : public PPCallbacks { - std::vector<SourceRange> SkippedRanges; +class CoverageSourceInfo : public PPCallbacks, public CommentHandler { + // A vector of skipped source ranges, each paired with PrevTokLoc and NextTokLoc. + std::vector<SkippedRange> SkippedRanges; + bool AfterComment = false; + public: - ArrayRef<SourceRange> getSkippedRanges() const { return SkippedRanges; } + // Location of the token parsed before HandleComment is called. This is + // updated every time Preprocessor::Lex lexes a new token. + SourceLocation PrevTokLoc; + // The location of the token before the comment. + SourceLocation BeforeCommentLoc; + + std::vector<SkippedRange> &getSkippedRanges() { return SkippedRanges; } void SourceRangeSkipped(SourceRange Range, SourceLocation EndifLoc) override; + + bool HandleComment(Preprocessor &PP, SourceRange Range) override; + + void updateNextTokLoc(SourceLocation Loc); }; namespace CodeGen { @@ -66,6 +92,8 @@ class CoverageMappingModuleGen { uint64_t FilenamesRef); public: + static CoverageSourceInfo *setUpCoverageCallbacks(Preprocessor &PP); + CoverageMappingModuleGen(CodeGenModule &CGM, CoverageSourceInfo &SourceInfo) : CGM(CGM), SourceInfo(SourceInfo) {} diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 8b5b4dd6e53ad..748f4be3eef93 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -2404,10 +2404,8 @@ class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { } /// Disable tail call on x86-64. The epilogue code before the tail jump blocks
- /// the autoreleaseRV/retainRV optimization. - bool shouldSuppressTailCallsOfRetainAutoreleasedReturnValue() const override { - return true; - } + /// autoreleaseRV/retainRV and autoreleaseRV/unsafeClaimRV optimizations. + bool markARCOptimizedReturnCallsAsNoTail() const override { return true; } int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override { return 7; @@ -10745,21 +10743,24 @@ class VEABIInfo : public DefaultABIInfo { } // end anonymous namespace ABIArgInfo VEABIInfo::classifyReturnType(QualType Ty) const { - if (Ty->isAnyComplexType()) { + if (Ty->isAnyComplexType()) return ABIArgInfo::getDirect(); - } + uint64_t Size = getContext().getTypeSize(Ty); + if (Size < 64 && Ty->isIntegerType()) + return ABIArgInfo::getExtend(Ty); return DefaultABIInfo::classifyReturnType(Ty); } ABIArgInfo VEABIInfo::classifyArgumentType(QualType Ty) const { - if (Ty->isAnyComplexType()) { + if (Ty->isAnyComplexType()) return ABIArgInfo::getDirect(); - } + uint64_t Size = getContext().getTypeSize(Ty); + if (Size < 64 && Ty->isIntegerType()) + return ABIArgInfo::getExtend(Ty); return DefaultABIInfo::classifyArgumentType(Ty); } void VEABIInfo::computeInfo(CGFunctionInfo &FI) const { - FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); for (auto &Arg : FI.arguments()) Arg.info = classifyArgumentType(Arg.type); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 1152cabce4a0d..0df9667e91e16 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -163,11 +163,9 @@ class TargetCodeGenInfo { return ""; } - /// Determine whether a call to objc_retainAutoreleasedReturnValue should be - /// marked as 'notail'. - virtual bool shouldSuppressTailCallsOfRetainAutoreleasedReturnValue() const { - return false; - } + /// Determine whether a call to objc_retainAutoreleasedReturnValue or + /// objc_unsafeClaimAutoreleasedReturnValue should be marked as 'notail'. + virtual bool markARCOptimizedReturnCallsAsNoTail() const { return false; } /// Return a constant used by UBSan as a signature to identify functions /// possessing type information, or 0 if the platform is unsupported. diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 56ca80cef827b..a8e045ddbf1bc 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6094,10 +6094,20 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, } SmallString<128> BasePath(BaseInput); + SmallString<128> ExternalPath(""); StringRef BaseName; // Dsymutil actions should use the full path. - if (isa<DsymutilJobAction>(JA) || isa<VerifyJobAction>(JA)) + if (isa<DsymutilJobAction>(JA) && C.getArgs().hasArg(options::OPT_dsym_dir)) { + ExternalPath += C.getArgs().getLastArg(options::OPT_dsym_dir)->getValue(); + // We use posix style here because the tests (specifically + // darwin-dsymutil.c) demonstrate that posix-style paths are acceptable + // even on Windows; if we don't use them, the similar test covering this + // fails.
+ llvm::sys::path::append(ExternalPath, llvm::sys::path::Style::posix, + llvm::sys::path::filename(BasePath)); + BaseName = ExternalPath; + } else if (isa<DsymutilJobAction>(JA) || isa<VerifyJobAction>(JA)) BaseName = BasePath; else BaseName = llvm::sys::path::filename(BasePath); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index e4fda752c041d..8c49e92b2c0f9 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -1088,6 +1088,23 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, Sanitizers.has(SanitizerKind::Address)) CmdArgs.push_back("-fno-assume-sane-operator-new"); + // libFuzzer wants to intercept calls to certain library functions, so the + // following -fno-builtin-* flags force the compiler to emit interposable + // libcalls to these functions. Other sanitizers effectively do the same thing + // by marking all library call sites with NoBuiltin attribute in their LLVM + // pass. (see llvm::maybeMarkSanitizerLibraryCallNoBuiltin) + if (Sanitizers.has(SanitizerKind::FuzzerNoLink)) { + CmdArgs.push_back("-fno-builtin-bcmp"); + CmdArgs.push_back("-fno-builtin-memcmp"); + CmdArgs.push_back("-fno-builtin-strncmp"); + CmdArgs.push_back("-fno-builtin-strcmp"); + CmdArgs.push_back("-fno-builtin-strncasecmp"); + CmdArgs.push_back("-fno-builtin-strcasecmp"); + CmdArgs.push_back("-fno-builtin-strstr"); + CmdArgs.push_back("-fno-builtin-strcasestr"); + CmdArgs.push_back("-fno-builtin-memmem"); + } + // Require -fvisibility= flag on non-Windows when compiling if vptr CFI is // enabled. if (Sanitizers.hasOneOf(CFIClasses) && !TC.getTriple().isOSWindows() && diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index ac5544eedb00b..f9d8e18d6fd01 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -44,12 +44,6 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-a64"); } - // Accept an undefined symbol as an extern so that an error message is not - // displayed. Otherwise, undefined symbols are flagged with error messages. - // FIXME: This should be removed when the assembly generation from the - // compiler is able to write externs properly. - CmdArgs.push_back("-u"); - // Accept any mixture of instructions. // On Power for AIX and Linux, this behaviour matches that of GCC for both the // user-provided assembler source case and the compiler-produced assembler diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index afe896b4a65bf..d74d5db0c083d 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -73,14 +73,15 @@ static unsigned getARMFPUFeatures(const Driver &D, const Arg *A, } // Decode ARM features from string like +[no]featureA+[no]featureB+...
-static bool DecodeARMFeatures(const Driver &D, StringRef text, - StringRef CPU, llvm::ARM::ArchKind ArchKind, - std::vector<StringRef> &Features) { +static bool DecodeARMFeatures(const Driver &D, StringRef text, StringRef CPU, + llvm::ARM::ArchKind ArchKind, + std::vector<StringRef> &Features, + unsigned &ArgFPUID) { SmallVector<StringRef, 8> Split; text.split(Split, StringRef("+"), -1, false); for (StringRef Feature : Split) { - if (!appendArchExtFeatures(CPU, ArchKind, Feature, Features)) + if (!appendArchExtFeatures(CPU, ArchKind, Feature, Features, ArgFPUID)) return false; } return true; @@ -102,14 +103,14 @@ static void DecodeARMFeaturesFromCPU(const Driver &D, StringRef CPU, static void checkARMArchName(const Driver &D, const Arg *A, const ArgList &Args, llvm::StringRef ArchName, llvm::StringRef CPUName, std::vector<StringRef> &Features, - const llvm::Triple &Triple) { + const llvm::Triple &Triple, unsigned &ArgFPUID) { std::pair<StringRef, StringRef> Split = ArchName.split("+"); std::string MArch = arm::getARMArch(ArchName, Triple); llvm::ARM::ArchKind ArchKind = llvm::ARM::parseArch(MArch); if (ArchKind == llvm::ARM::ArchKind::INVALID || - (Split.second.size() && !DecodeARMFeatures( - D, Split.second, CPUName, ArchKind, Features))) + (Split.second.size() && !DecodeARMFeatures(D, Split.second, CPUName, + ArchKind, Features, ArgFPUID))) D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); } @@ -117,15 +118,15 @@ static void checkARMCPUName(const Driver &D, const Arg *A, const ArgList &Args, llvm::StringRef CPUName, llvm::StringRef ArchName, std::vector<StringRef> &Features, - const llvm::Triple &Triple) { + const llvm::Triple &Triple, unsigned &ArgFPUID) { std::pair<StringRef, StringRef> Split = CPUName.split("+"); std::string CPU = arm::getARMTargetCPU(CPUName, ArchName, Triple); llvm::ARM::ArchKind ArchKind = arm::getLLVMArchKindForARM(CPU, ArchName, Triple); if (ArchKind == llvm::ARM::ArchKind::INVALID || - (Split.second.size() && !DecodeARMFeatures( - D, Split.second, CPU, ArchKind, Features))) + (Split.second.size() && + !DecodeARMFeatures(D, Split.second, CPU, ArchKind, Features, ArgFPUID))) D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); } @@ -347,6 +348,8 @@ void arm::getARMTargetFeatures(const Driver &D, const llvm::Triple &Triple, const Arg *CPUArg = Args.getLastArg(options::OPT_mcpu_EQ); StringRef ArchName; StringRef CPUName; + unsigned ArchArgFPUID = llvm::ARM::FK_INVALID; + unsigned CPUArgFPUID = llvm::ARM::FK_INVALID; // Check -mcpu. ClangAs gives preference to -Wa,-mcpu=. if (WaCPU) { @@ -364,14 +367,14 @@ void arm::getARMTargetFeatures(const Driver &D, const llvm::Triple &Triple, D.Diag(clang::diag::warn_drv_unused_argument) << ArchArg->getAsString(Args); ArchName = StringRef(WaArch->getValue()).substr(7); - checkARMArchName(D, WaArch, Args, ArchName, CPUName, - ExtensionFeatures, Triple); + checkARMArchName(D, WaArch, Args, ArchName, CPUName, ExtensionFeatures, + Triple, ArchArgFPUID); // FIXME: Set Arch.
D.Diag(clang::diag::warn_drv_unused_argument) << WaArch->getAsString(Args); } else if (ArchArg) { ArchName = ArchArg->getValue(); - checkARMArchName(D, ArchArg, Args, ArchName, CPUName, - ExtensionFeatures, Triple); + checkARMArchName(D, ArchArg, Args, ArchName, CPUName, ExtensionFeatures, + Triple, ArchArgFPUID); } // Add CPU features for generic CPUs @@ -390,8 +393,8 @@ void arm::getARMTargetFeatures(const Driver &D, const llvm::Triple &Triple, } if (CPUArg) - checkARMCPUName(D, CPUArg, Args, CPUName, ArchName, - ExtensionFeatures, Triple); + checkARMCPUName(D, CPUArg, Args, CPUName, ArchName, ExtensionFeatures, + Triple, CPUArgFPUID); // Honor -mfpu=. ClangAs gives preference to -Wa,-mfpu=. unsigned FPUID = llvm::ARM::FK_INVALID; const Arg *FPUArg = Args.getLastArg(options::OPT_mfpu_EQ); @@ -455,20 +458,26 @@ void arm::getARMTargetFeatures(const Driver &D, const llvm::Triple &Triple, Features.push_back("+fullfp16"); } - // Setting -msoft-float/-mfloat-abi=soft effectively disables the FPU (GCC - // ignores the -mfpu options in this case). - // Note that the ABI can also be set implicitly by the target selected. + // Setting -msoft-float/-mfloat-abi=soft, -mfpu=none, or adding +nofp to + // -march/-mcpu effectively disables the FPU (GCC ignores the -mfpu options in + // this case). Note that the ABI can also be set implicitly by the target + // selected. if (ABI == arm::FloatABI::Soft) { llvm::ARM::getFPUFeatures(llvm::ARM::FK_NONE, Features); // Disable all features relating to hardware FP, not already disabled by the // above call. + Features.insert(Features.end(), {"-dotprod", "-fp16fml", "-bf16", "-mve", + "-mve.fp", "-fpregs"}); + } else if (FPUID == llvm::ARM::FK_NONE || + ArchArgFPUID == llvm::ARM::FK_NONE || + CPUArgFPUID == llvm::ARM::FK_NONE) { + // -mfpu=none, -march=armvX+nofp or -mcpu=X+nofp is *very* similar to + // -mfloat-abi=soft, only that it should not disable MVE-I. They disable the + // FPU, but not the FPU registers, thus MVE-I, which depends only on the + // latter, is still supported. Features.insert(Features.end(), - {"-dotprod", "-fp16fml", "-mve", "-mve.fp", "-fpregs"}); - } else if (FPUID == llvm::ARM::FK_NONE) { - // -mfpu=none is *very* similar to -mfloat-abi=soft, only that it should not - // disable MVE-I. 
- Features.insert(Features.end(), {"-dotprod", "-fp16fml", "-mve.fp"}); + {"-dotprod", "-fp16fml", "-bf16", "-mve.fp"}); if (!hasIntegerMVE(Features)) Features.emplace_back("-fpregs"); } diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 144e276a6bd87..bcaecf4b2d980 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -57,7 +57,6 @@ std::string ppc::getPPCTargetCPU(const ArgList &Args) { .Case("970", "970") .Case("G5", "g5") .Case("a2", "a2") - .Case("a2q", "a2q") .Case("e500", "e500") .Case("e500mc", "e500mc") .Case("e5500", "e5500") diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index 09ae4538b3acc..7ca05a1f3a395 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -63,7 +63,7 @@ isExperimentalExtension(StringRef Ext) { Ext == "zbs" || Ext == "zbt" || Ext == "zbproposedc") return RISCVExtensionVersion{"0", "92"}; if (Ext == "v") - return RISCVExtensionVersion{"0", "8"}; + return RISCVExtensionVersion{"0", "9"}; return None; } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4d1b12e337dbb..64a03682bb748 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -508,7 +508,7 @@ static codegenoptions::DebugInfoKind DebugLevelToInfoKind(const Arg &A) { return codegenoptions::DebugLineTablesOnly; if (A.getOption().matches(options::OPT_gline_directives_only)) return codegenoptions::DebugDirectivesOnly; - return codegenoptions::DebugInfoConstructor; + return codegenoptions::LimitedDebugInfo; } static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) { @@ -1926,18 +1926,6 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, if (T.isOSBinFormatELF()) { switch (getToolChain().getArch()) { case llvm::Triple::ppc64: { - // When targeting a processor that supports QPX, or if QPX is - // specifically enabled, default to using the ABI that supports QPX (so - // long as it is not specifically disabled). - bool HasQPX = false; - if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) - HasQPX = A->getValue() == StringRef("a2q"); - HasQPX = Args.hasFlag(options::OPT_mqpx, options::OPT_mno_qpx, HasQPX); - if (HasQPX) { - ABIName = "elfv1-qpx"; - break; - } - if (T.isMusl() || (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13)) ABIName = "elfv2"; else @@ -2438,7 +2426,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, CmdArgs.push_back(Value.data()); } else { RenderDebugEnablingArgs(Args, CmdArgs, - codegenoptions::DebugInfoConstructor, + codegenoptions::LimitedDebugInfo, DwarfVersion, llvm::DebuggerKind::Default); } } else if (Value.startswith("-mcpu") || Value.startswith("-mfpu") || @@ -3711,7 +3699,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, if (const Arg *A = Args.getLastArg(options::OPT_g_Group, options::OPT_gsplit_dwarf, options::OPT_gsplit_dwarf_EQ)) { - DebugInfoKind = codegenoptions::DebugInfoConstructor; + DebugInfoKind = codegenoptions::LimitedDebugInfo; // If the last option explicitly specified a debug-info level, use it. 
if (checkDebugInfoOption(A, Args, D, TC) && @@ -3816,7 +3804,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, if (checkDebugInfoOption(A, Args, D, TC)) { if (DebugInfoKind != codegenoptions::DebugLineTablesOnly && DebugInfoKind != codegenoptions::DebugDirectivesOnly) { - DebugInfoKind = codegenoptions::DebugInfoConstructor; + DebugInfoKind = codegenoptions::LimitedDebugInfo; CmdArgs.push_back("-dwarf-ext-refs"); CmdArgs.push_back("-fmodule-format=obj"); } @@ -3836,9 +3824,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D, TC.GetDefaultStandaloneDebug()); if (const Arg *A = Args.getLastArg(options::OPT_fstandalone_debug)) (void)checkDebugInfoOption(A, Args, D, TC); - if ((DebugInfoKind == codegenoptions::LimitedDebugInfo || - DebugInfoKind == codegenoptions::DebugInfoConstructor) && - NeedFullDebug) + if (DebugInfoKind == codegenoptions::LimitedDebugInfo && NeedFullDebug) DebugInfoKind = codegenoptions::FullDebugInfo; if (Args.hasFlag(options::OPT_gembed_source, options::OPT_gno_embed_source, @@ -6787,7 +6773,7 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, options::OPT_gline_tables_only)) { *EmitCodeView = true; if (DebugInfoArg->getOption().matches(options::OPT__SLASH_Z7)) - *DebugInfoKind = codegenoptions::DebugInfoConstructor; + *DebugInfoKind = codegenoptions::LimitedDebugInfo; else *DebugInfoKind = codegenoptions::DebugLineTablesOnly; } else { @@ -7088,7 +7074,7 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA, // the guard for source type, however there is a test which asserts // that some assembler invocation receives no -debug-info-kind, // and it's not clear whether that test is just overly restrictive. - DebugInfoKind = (WantDebug ? codegenoptions::DebugInfoConstructor + DebugInfoKind = (WantDebug ? codegenoptions::LimitedDebugInfo : codegenoptions::NoDebugInfo); // Add the -fdebug-compilation-dir flag if needed. addDebugCompDirArg(Args, CmdArgs, C.getDriver().getVFS()); diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index f910c88fa9674..cf8a3e6aee273 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1062,10 +1062,9 @@ void MachO::AddLinkRuntimeLib(const ArgList &Args, ArgStringList &CmdArgs, DarwinLibName += Component; if (!(Opts & RLO_IsEmbedded)) DarwinLibName += "_"; - DarwinLibName += getOSLibraryNameSuffix(); - } else - DarwinLibName += getOSLibraryNameSuffix(true); + } + DarwinLibName += getOSLibraryNameSuffix(); DarwinLibName += IsShared ? 
"_dynamic.dylib" : ".a"; SmallString<128> Dir(getDriver().ResourceDir); llvm::sys::path::append( @@ -2714,6 +2713,7 @@ void Darwin::CheckObjCARC() const { SanitizerMask Darwin::getSupportedSanitizers() const { const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64; + const bool IsAArch64 = getTriple().getArch() == llvm::Triple::aarch64; SanitizerMask Res = ToolChain::getSupportedSanitizers(); Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; @@ -2731,9 +2731,8 @@ SanitizerMask Darwin::getSupportedSanitizers() const { && !(isTargetIPhoneOS() && isIPhoneOSVersionLT(5, 0))) Res |= SanitizerKind::Vptr; - if (isTargetMacOS()) { - if (IsX86_64) - Res |= SanitizerKind::Thread; + if ((IsX86_64 || IsAArch64) && isTargetMacOS()) { + Res |= SanitizerKind::Thread; } else if (isTargetIOSSimulator() || isTargetTvOSSimulator()) { if (IsX86_64) Res |= SanitizerKind::Thread; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index e476f296b0caa..465414af36868 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -38,10 +38,10 @@ using tools::addMultilibFlag; using tools::addPathIfExists; static bool forwardToGCC(const Option &O) { - // Don't forward inputs from the original command line. They are added from - // InputInfoList. - return O.getKind() != Option::InputClass && - !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); + // LinkerInput options have been forwarded. Don't duplicate. + if (O.hasFlag(options::LinkerInput)) + return false; + return O.matches(options::OPT_Link_Group) || O.hasFlag(options::LinkOption); } // Switch CPU names not recognized by GNU assembler to a close CPU that it does @@ -76,23 +76,6 @@ void tools::gcc::Common::ConstructJob(Compilation &C, const JobAction &JA, // to get to the assembler. A->claim(); - // Don't forward any -g arguments to assembly steps. - if (isa(JA) && - A->getOption().matches(options::OPT_g_Group)) - continue; - - // Don't forward any -W arguments to assembly and link steps. - if ((isa(JA) || isa(JA)) && - A->getOption().matches(options::OPT_W_Group)) - continue; - - // Don't forward -mno-unaligned-access since GCC doesn't understand - // it and because it doesn't affect the assembly or link steps. - if ((isa(JA) || isa(JA)) && - (A->getOption().matches(options::OPT_munaligned_access) || - A->getOption().matches(options::OPT_mno_unaligned_access))) - continue; - A->render(Args, CmdArgs); } } diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index b1497651a8fef..f3202bcb5bc14 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -284,7 +284,7 @@ bool ContinuationIndenter::canBreak(const LineState &State) { // The opening "{" of a braced list has to be on the same line as the first // element if it is nested in another braced init list or function call. if (!Current.MustBreakBefore && Previous.is(tok::l_brace) && - Previous.isNot(TT_DictLiteral) && Previous.BlockKind == BK_BracedInit && + Previous.isNot(TT_DictLiteral) && Previous.is(BK_BracedInit) && Previous.Previous && Previous.Previous->isOneOf(tok::l_brace, tok::l_paren, tok::comma)) return false; @@ -501,7 +501,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // The following could be precomputed as they do not depend on the state. 
// However, as they should take effect only if the UnwrappedLine does not fit // into the ColumnLimit, they are checked here in the ContinuationIndenter. - if (Style.ColumnLimit != 0 && Previous.BlockKind == BK_Block && + if (Style.ColumnLimit != 0 && Previous.is(BK_Block) && Previous.is(tok::l_brace) && !Current.isOneOf(tok::r_brace, tok::comment)) return true; @@ -627,7 +627,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // opening parenthesis. Don't break if it doesn't conserve columns. if (Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak && (Previous.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) || - (Previous.is(tok::l_brace) && Previous.BlockKind != BK_Block && + (Previous.is(tok::l_brace) && Previous.isNot(BK_Block) && Style.Cpp11BracedListStyle)) && State.Column > getNewLineColumn(State) && (!Previous.Previous || !Previous.Previous->isOneOf( @@ -648,7 +648,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && !State.Stack.back().IsCSharpGenericTypeConstraint && Previous.opensScope() && Previous.isNot(TT_ObjCMethodExpr) && - (Current.isNot(TT_LineComment) || Previous.BlockKind == BK_BracedInit)) { + (Current.isNot(TT_LineComment) || Previous.is(BK_BracedInit))) { State.Stack.back().Indent = State.Column + Spaces; State.Stack.back().IsAligned = true; } @@ -972,7 +972,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { return (Style.IndentWidth * State.Line->First->IndentLevel) + Style.IndentWidth; - if (NextNonComment->is(tok::l_brace) && NextNonComment->BlockKind == BK_Block) + if (NextNonComment->is(tok::l_brace) && NextNonComment->is(BK_Block)) return Current.NestingLevel == 0 ? 
State.FirstIndent : State.Stack.back().Indent; if ((Current.isOneOf(tok::r_brace, tok::r_square) || @@ -982,8 +982,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { State.Stack.size() > 1) { if (Current.closesBlockOrBlockTypeList(Style)) return State.Stack[State.Stack.size() - 2].NestedBlockIndent; - if (Current.MatchingParen && - Current.MatchingParen->BlockKind == BK_BracedInit) + if (Current.MatchingParen && Current.MatchingParen->is(BK_BracedInit)) return State.Stack[State.Stack.size() - 2].LastSpace; return State.FirstIndent; } @@ -1417,7 +1416,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, State.Stack.back().IsCSharpGenericTypeConstraint) return; - if (Current.MatchingParen && Current.BlockKind == BK_Block) { + if (Current.MatchingParen && Current.is(BK_Block)) { moveStateToNewBlock(State); return; } @@ -1486,9 +1485,8 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, (State.Line->MustBeDeclaration && !BinPackDeclaration) || (!State.Line->MustBeDeclaration && !Style.BinPackArguments) || (Style.ExperimentalAutoDetectBinPacking && - (Current.PackingKind == PPK_OnePerLine || - (!BinPackInconclusiveFunctions && - Current.PackingKind == PPK_Inconclusive))); + (Current.is(PPK_OnePerLine) || + (!BinPackInconclusiveFunctions && Current.is(PPK_Inconclusive)))); if (Current.is(TT_ObjCMethodExpr) && Current.MatchingParen && Style.ObjCBreakBeforeNestedBlockParam) { diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 3966f0a38639b..8c1d7c90e02a0 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1575,9 +1575,9 @@ class Formatter : public TokenAnalyzer { continue; FormatToken *Tok = AnnotatedLines[i]->First->Next; while (Tok->Next) { - if (Tok->PackingKind == PPK_BinPacked) + if (Tok->is(PPK_BinPacked)) HasBinPackedFunction = true; - if (Tok->PackingKind == PPK_OnePerLine) + if (Tok->is(PPK_OnePerLine)) HasOnePerLineFunction = true; Tok = Tok->Next; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 7d792974cd577..4bc865b043fd2 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -85,8 +85,8 @@ unsigned CommaSeparatedList::formatAfterToken(LineState &State, const FormatToken *LBrace = State.NextToken->Previous->getPreviousNonComment(); if (!LBrace || !LBrace->isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || - LBrace->BlockKind == BK_Block || LBrace->getType() == TT_DictLiteral || - LBrace->Next->getType() == TT_DesignatedInitializerPeriod) + LBrace->is(BK_Block) || LBrace->is(TT_DictLiteral) || + LBrace->Next->is(TT_DesignatedInitializerPeriod)) return 0; // Calculate the number of code points we have to format this list. As the diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index d4287f53fde37..ece1bf4b97f7b 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -140,80 +140,156 @@ class AnnotatedLine; /// A wrapper around a \c Token storing information about the /// whitespace characters preceding it. 
struct FormatToken { - FormatToken() {} + FormatToken() + : HasUnescapedNewline(false), IsMultiline(false), IsFirst(false), + MustBreakBefore(false), MustBreakAlignBefore(false), + IsUnterminatedLiteral(false), CanBreakBefore(false), + ClosesTemplateDeclaration(false), StartsBinaryExpression(false), + EndsBinaryExpression(false), PartOfMultiVariableDeclStmt(false), + ContinuesLineCommentSection(false), Finalized(false), + BlockKind(BK_Unknown), Type(TT_Unknown), Decision(FD_Unformatted), + PackingKind(PPK_Inconclusive) {} /// The \c Token. Token Tok; - /// The number of newlines immediately before the \c Token. + /// The raw text of the token. /// - /// This can be used to determine what the user wrote in the original code - /// and thereby e.g. leave an empty line between two function definitions. - unsigned NewlinesBefore = 0; + /// Contains the raw token text without leading whitespace and without leading + /// escaped newlines. + StringRef TokenText; - /// Whether there is at least one unescaped newline before the \c - /// Token. - bool HasUnescapedNewline = false; + /// A token can have a special role that can carry extra information + /// about the token's formatting. + std::unique_ptr<TokenRole> Role; /// The range of the whitespace immediately preceding the \c Token. SourceRange WhitespaceRange; - /// The offset just past the last '\n' in this token's leading - /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. - unsigned LastNewlineOffset = 0; - - /// The width of the non-whitespace parts of the token (or its first - /// line for multi-line tokens) in columns. - /// We need this to correctly measure number of columns a token spans. - unsigned ColumnWidth = 0; - - /// Contains the width in columns of the last line of a multi-line - /// token. - unsigned LastLineColumnWidth = 0; + /// Whether there is at least one unescaped newline before the \c + /// Token. + unsigned HasUnescapedNewline : 1; /// Whether the token text contains newlines (escaped or not). - bool IsMultiline = false; + unsigned IsMultiline : 1; /// Indicates that this is the first token of the file. - bool IsFirst = false; + unsigned IsFirst : 1; /// Whether there must be a line break before this token. /// /// This happens for example when a preprocessor directive ended directly /// before the token. - bool MustBreakBefore = false; + unsigned MustBreakBefore : 1; /// Whether to not align across this token /// /// This happens for example when a preprocessor directive ended directly /// before the token, but very rarely otherwise. - bool MustBreakAlignBefore = false; + unsigned MustBreakAlignBefore : 1; - /// The raw text of the token. + /// Set to \c true if this token is an unterminated literal. + unsigned IsUnterminatedLiteral : 1; + + /// \c true if it is allowed to break before this token. + unsigned CanBreakBefore : 1; + + /// \c true if this is the ">" of "template<..>". + unsigned ClosesTemplateDeclaration : 1; + + /// \c true if this token starts a binary expression, i.e. has at least + /// one fake l_paren with a precedence greater than prec::Unknown. + unsigned StartsBinaryExpression : 1; + /// \c true if this token ends a binary expression. + unsigned EndsBinaryExpression : 1; + + /// Is this token part of a \c DeclStmt defining multiple variables? /// - /// Contains the raw token text without leading whitespace and without leading - /// escaped newlines. - StringRef TokenText;
+ unsigned PartOfMultiVariableDeclStmt : 1; - /// Set to \c true if this token is an unterminated literal. - bool IsUnterminatedLiteral = 0; + /// Does this line comment continue a line comment section? + /// + /// Only set to true if \c Type == \c TT_LineComment. + unsigned ContinuesLineCommentSection : 1; + /// If \c true, this token has been fully formatted (indented and + /// potentially re-formatted inside), and we do not allow further formatting + /// changes. + unsigned Finalized : 1; + +private: /// Contains the kind of block if this token is a brace. - BraceBlockKind BlockKind = BK_Unknown; + unsigned BlockKind : 2; +public: + BraceBlockKind getBlockKind() const { + return static_cast<BraceBlockKind>(BlockKind); + } + void setBlockKind(BraceBlockKind BBK) { + BlockKind = BBK; + assert(getBlockKind() == BBK && "BraceBlockKind overflow!"); + } + +private: + unsigned Type : 8; + +public: /// Returns the token's type, e.g. whether "<" is a template opener or /// binary operator. - TokenType getType() const { return Type; } - void setType(TokenType T) { Type = T; } + TokenType getType() const { return static_cast<TokenType>(Type); } + void setType(TokenType T) { + Type = T; + assert(getType() == T && "TokenType overflow!"); + } - /// The number of spaces that should be inserted before this token. - unsigned SpacesRequiredBefore = 0; +private: + /// Stores the formatting decision for the token once it was made. + unsigned Decision : 2; - /// \c true if it is allowed to break before this token. - bool CanBreakBefore = false; +public: + FormatDecision getDecision() const { + return static_cast<FormatDecision>(Decision); + } + void setDecision(FormatDecision D) { + Decision = D; + assert(getDecision() == D && "FormatDecision overflow!"); + } - /// \c true if this is the ">" of "template<..>". - bool ClosesTemplateDeclaration = false; +private: + /// If this is an opening parenthesis, how are the parameters packed? + unsigned PackingKind : 2; + +public: + ParameterPackingKind getPackingKind() const { + return static_cast<ParameterPackingKind>(PackingKind); + } + void setPackingKind(ParameterPackingKind K) { + PackingKind = K; + assert(getPackingKind() == K && "ParameterPackingKind overflow!"); + } + + /// The number of newlines immediately before the \c Token. + /// + /// This can be used to determine what the user wrote in the original code + /// and thereby e.g. leave an empty line between two function definitions. + unsigned NewlinesBefore = 0; + + /// The offset just past the last '\n' in this token's leading + /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. + unsigned LastNewlineOffset = 0; + + /// The width of the non-whitespace parts of the token (or its first + /// line for multi-line tokens) in columns. + /// We need this to correctly measure number of columns a token spans. + unsigned ColumnWidth = 0; + + /// Contains the width in columns of the last line of a multi-line + /// token. + unsigned LastLineColumnWidth = 0; + + /// The number of spaces that should be inserted before this token. + unsigned SpacesRequiredBefore = 0; /// Number of parameters, if this is "(", "[" or "<". unsigned ParameterCount = 0; @@ -226,13 +302,6 @@ struct FormatToken { /// the surrounding bracket. tok::TokenKind ParentBracket = tok::unknown; - /// A token can have a special role that can carry extra information - /// about the token's formatting. - std::unique_ptr<TokenRole> Role; - - /// If this is an opening parenthesis, how are the parameters packed?
- ParameterPackingKind PackingKind = PPK_Inconclusive; - /// The total length of the unwrapped line up to and including this /// token. unsigned TotalLength = 0; @@ -286,12 +355,6 @@ struct FormatToken { /// Insert this many fake ) after this token for correct indentation. unsigned FakeRParens = 0; - /// \c true if this token starts a binary expression, i.e. has at least - /// one fake l_paren with a precedence greater than prec::Unknown. - bool StartsBinaryExpression = false; - /// \c true if this token ends a binary expression. - bool EndsBinaryExpression = false; - /// If this is an operator (or "."/"->") in a sequence of operators /// with the same precedence, contains the 0-based operator index. unsigned OperatorIndex = 0; @@ -300,16 +363,6 @@ struct FormatToken { /// with the same precedence, points to the next operator. FormatToken *NextOperator = nullptr; - /// Is this token part of a \c DeclStmt defining multiple variables? - /// - /// Only set if \c Type == \c TT_StartOfName. - bool PartOfMultiVariableDeclStmt = false; - - /// Does this line comment continue a line comment section? - /// - /// Only set to true if \c Type == \c TT_LineComment. - bool ContinuesLineCommentSection = false; - /// If this is a bracket, this points to the matching one. FormatToken *MatchingParen = nullptr; @@ -323,16 +376,8 @@ struct FormatToken { /// in it. SmallVector<AnnotatedLine *, 1> Children; - /// Stores the formatting decision for the token once it was made. - FormatDecision Decision = FD_Unformatted; - - /// If \c true, this token has been fully formatted (indented and - /// potentially re-formatted inside), and we do not allow further formatting - /// changes. - bool Finalized = false; - bool is(tok::TokenKind Kind) const { return Tok.is(Kind); } - bool is(TokenType TT) const { return Type == TT; } + bool is(TokenType TT) const { return getType() == TT; } bool is(const IdentifierInfo *II) const { return II && II == Tok.getIdentifierInfo(); } @@ -340,6 +385,9 @@ struct FormatToken { return Tok.getIdentifierInfo() && Tok.getIdentifierInfo()->getPPKeywordID() == Kind; } + bool is(BraceBlockKind BBK) const { return getBlockKind() == BBK; } + bool is(ParameterPackingKind PPK) const { return getPackingKind() == PPK; } + template <typename A, typename B> bool isOneOf(A K1, B K2) const { return is(K1) || is(K2); } @@ -355,7 +403,7 @@ struct FormatToken { } bool closesScopeAfterBlock() const { - if (BlockKind == BK_Block) + if (getBlockKind() == BK_Block) return true; if (closesScope()) return Previous->closesScopeAfterBlock(); @@ -525,13 +573,13 @@ struct FormatToken { /// list that should be indented with a block indent. bool opensBlockOrBlockTypeList(const FormatStyle &Style) const { // C# does not indent object initialisers as continuations.
- if (is(tok::l_brace) && BlockKind == BK_BracedInit && Style.isCSharp()) + if (is(tok::l_brace) && getBlockKind() == BK_BracedInit && Style.isCSharp()) return true; if (is(TT_TemplateString) && opensScope()) return true; return is(TT_ArrayInitializerLSquare) || is(TT_ProtoExtensionLSquare) || (is(tok::l_brace) && - (BlockKind == BK_Block || is(TT_DictLiteral) || + (getBlockKind() == BK_Block || is(TT_DictLiteral) || (!Style.Cpp11BracedListStyle && NestingLevel == 0))) || (is(tok::less) && (Style.Language == FormatStyle::LK_Proto || Style.Language == FormatStyle::LK_TextProto)); @@ -602,8 +650,6 @@ struct FormatToken { return Previous->endsSequenceInternal(K1, Tokens...); return is(K1) && Previous && Previous->endsSequenceInternal(Tokens...); } - - TokenType Type = TT_Unknown; }; class ContinuationIndenter; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 0ab09b4a1218a..6cbaf8a30812b 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -27,7 +27,7 @@ namespace format { namespace { /// Returns \c true if the token can be used as an identifier in -/// an Objective-C \c @selector, \c false otherwise. +/// an Objective-C \c \@selector, \c false otherwise. /// /// Because getFormattingLangOpts() always lexes source code as /// Objective-C++, C++ keywords like \c new and \c delete are @@ -343,11 +343,11 @@ class AnnotatingParser { CurrentToken->setType(TT_AttributeSquare); if (!HasMultipleLines) - Left->PackingKind = PPK_Inconclusive; + Left->setPackingKind(PPK_Inconclusive); else if (HasMultipleParametersOnALine) - Left->PackingKind = PPK_BinPacked; + Left->setPackingKind(PPK_BinPacked); else - Left->PackingKind = PPK_OnePerLine; + Left->setPackingKind(PPK_OnePerLine); next(); return true; @@ -704,7 +704,7 @@ class AnnotatingParser { ScopedContextCreator ContextCreator(*this, tok::l_brace, 1); Contexts.back().ColonIsDictLiteral = true; - if (Left->BlockKind == BK_BracedInit) + if (Left->is(BK_BracedInit)) Contexts.back().IsExpression = true; if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous && Left->Previous->is(TT_JsTypeColon)) @@ -751,7 +751,7 @@ class AnnotatingParser { // For ObjC methods, the number of parameters is calculated differently as // method declarations have a different structure (the parameters are not // inside a bracket scope). - if (Current->is(tok::l_brace) && Current->BlockKind == BK_Block) + if (Current->is(tok::l_brace) && Current->is(BK_Block)) ++Left->BlockParameterCount; if (Current->is(tok::comma)) { ++Left->ParameterCount; @@ -2420,7 +2420,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { if (isFunctionDeclarationName(*Current, Line)) Current->setType(TT_FunctionDeclarationName); if (Current->is(TT_LineComment)) { - if (Current->Previous->BlockKind == BK_BracedInit && + if (Current->Previous->is(BK_BracedInit) && Current->Previous->opensScope()) Current->SpacesRequiredBefore = (Style.Cpp11BracedListStyle && !Style.SpacesInParentheses) ? 
0 : 1; @@ -2755,8 +2755,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (Left.isOneOf(tok::hashhash, tok::hash)) return Right.is(tok::hash); if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) || - (Left.is(tok::l_brace) && Left.BlockKind != BK_Block && - Right.is(tok::r_brace) && Right.BlockKind != BK_Block)) + (Left.is(tok::l_brace) && Left.isNot(BK_Block) && + Right.is(tok::r_brace) && Right.isNot(BK_Block))) return Style.SpaceInEmptyParentheses; if (Style.SpacesInConditionalStatement) { if (Left.is(tok::l_paren) && Left.Previous && @@ -2836,7 +2836,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return Right.Tok.isLiteral() || Right.is(TT_BlockComment) || (Right.isOneOf(Keywords.kw_override, Keywords.kw_final) && !Right.is(TT_StartOfName)) || - (Right.is(tok::l_brace) && Right.BlockKind == BK_Block) || + (Right.is(tok::l_brace) && Right.is(BK_Block)) || (!Right.isOneOf(TT_PointerOrReference, TT_ArraySubscriptLSquare, tok::l_paren) && (Style.PointerAlignment != FormatStyle::PAS_Right && @@ -2921,9 +2921,9 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return false; if (Left.is(tok::l_brace) && Right.is(tok::r_brace)) return !Left.Children.empty(); // No spaces in "{}". - if ((Left.is(tok::l_brace) && Left.BlockKind != BK_Block) || + if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) || (Right.is(tok::r_brace) && Right.MatchingParen && - Right.MatchingParen->BlockKind != BK_Block)) + Right.MatchingParen->isNot(BK_Block))) return Style.Cpp11BracedListStyle ? Style.SpacesInParentheses : true; if (Left.is(TT_BlockComment)) // No whitespace in x(/*foo=*/1), except for JavaScript. @@ -2967,7 +2967,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, tok::r_paren) || Left.isSimpleTypeSpecifier()) && Right.is(tok::l_brace) && Right.getNextNonComment() && - Right.BlockKind != BK_Block) + Right.isNot(BK_Block)) return false; if (Left.is(tok::period) || Right.is(tok::period)) return false; @@ -3009,7 +3009,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Style.isCpp()) { if (Left.is(tok::kw_operator)) return Right.is(tok::coloncolon); - if (Right.is(tok::l_brace) && Right.BlockKind == BK_BracedInit && + if (Right.is(tok::l_brace) && Right.is(BK_BracedInit) && !Left.opensScope() && Style.SpaceBeforeCpp11BracedList) return true; } else if (Style.Language == FormatStyle::LK_Proto || @@ -3362,7 +3362,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // Returns 'true' if 'Tok' is a brace we'd want to break before in Allman style. 
static bool isAllmanBrace(const FormatToken &Tok) { - return Tok.is(tok::l_brace) && Tok.BlockKind == BK_Block && + return Tok.is(tok::l_brace) && Tok.is(BK_Block) && !Tok.isOneOf(TT_ObjCBlockLBrace, TT_LambdaLBrace, TT_DictLiteral); } @@ -3398,7 +3398,7 @@ static bool isOneChildWithoutMustBreakBefore(const FormatToken &Tok) { return true; } static bool isAllmanLambdaBrace(const FormatToken &Tok) { - return (Tok.is(tok::l_brace) && Tok.BlockKind == BK_Block && + return (Tok.is(tok::l_brace) && Tok.is(BK_Block) && !Tok.isOneOf(TT_ObjCBlockLBrace, TT_DictLiteral)); } @@ -3498,7 +3498,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, if ((Left.isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || (Style.Language == FormatStyle::LK_JavaScript && Left.is(tok::l_paren))) && - Left.BlockKind != BK_Block && Left.MatchingParen) + Left.isNot(BK_Block) && Left.MatchingParen) BeforeClosingBrace = Left.MatchingParen->Previous; else if (Right.MatchingParen && (Right.MatchingParen->isOneOf(tok::l_brace, @@ -3512,8 +3512,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, } if (Right.is(tok::comment)) - return Left.BlockKind != BK_BracedInit && - Left.isNot(TT_CtorInitializerColon) && + return Left.isNot(BK_BracedInit) && Left.isNot(TT_CtorInitializerColon) && (Right.NewlinesBefore > 0 && Right.HasUnescapedNewline); if (Left.isTrailingComment()) return true; @@ -3822,7 +3821,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, // The first comment in a braced lists is always interpreted as belonging to // the first list element. Otherwise, it should be placed outside of the // list. - return Left.BlockKind == BK_BracedInit || + return Left.is(BK_BracedInit) || (Left.is(TT_CtorInitializerColon) && Style.BreakConstructorInitializers == FormatStyle::BCIS_AfterColon); if (Left.is(tok::question) && Right.is(tok::colon)) @@ -3923,7 +3922,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, // We only break before r_brace if there was a corresponding break before // the l_brace, which is tracked by BreakBeforeClosingBrace. if (Right.is(tok::r_brace)) - return Right.MatchingParen && Right.MatchingParen->BlockKind == BK_Block; + return Right.MatchingParen && Right.MatchingParen->is(BK_Block); // Allow breaking after a trailing annotation, e.g. after a method // declaration. 
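All of these TokenAnnotator and formatter call-site edits follow from one change: FormatToken's enum-typed members (BlockKind, Type, Decision, PackingKind) became narrow bit-fields behind checked accessors, with is() overloads for the common comparisons. A standalone distillation of the idiom, using a hypothetical cut-down struct rather than the real FormatToken:

#include <cassert>

enum BraceBlockKind { BK_Unknown, BK_Block, BK_BracedInit };

struct MiniToken {
private:
  unsigned BlockKind : 2; // packed: must be wide enough for every enumerator

public:
  MiniToken() : BlockKind(BK_Unknown) {}
  BraceBlockKind getBlockKind() const {
    return static_cast<BraceBlockKind>(BlockKind);
  }
  void setBlockKind(BraceBlockKind BBK) {
    BlockKind = BBK;
    // Catches silent truncation if the enum outgrows the bit-field width.
    assert(getBlockKind() == BBK && "BraceBlockKind overflow!");
  }
  // Overloading is() keeps call sites as terse as the old direct comparison.
  bool is(BraceBlockKind BBK) const { return getBlockKind() == BBK; }
};

Call sites then change mechanically, e.g. Tok->BlockKind == BK_BracedInit becomes Tok->is(BK_BracedInit), as in the hunks above and below.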
@@ -4008,9 +4007,9 @@ void TokenAnnotator::printDebugInfo(const AnnotatedLine &Line) { << " T=" << getTokenTypeName(Tok->getType()) << " S=" << Tok->SpacesRequiredBefore << " F=" << Tok->Finalized << " B=" << Tok->BlockParameterCount - << " BK=" << Tok->BlockKind << " P=" << Tok->SplitPenalty + << " BK=" << Tok->getBlockKind() << " P=" << Tok->SplitPenalty << " Name=" << Tok->Tok.getName() << " L=" << Tok->TotalLength - << " PPK=" << Tok->PackingKind << " FakeLParens="; + << " PPK=" << Tok->getPackingKind() << " FakeLParens="; for (unsigned i = 0, e = Tok->FakeLParens.size(); i != e; ++i) llvm::errs() << Tok->FakeLParens[i] << "/"; llvm::errs() << " FakeRParens=" << Tok->FakeRParens; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 22f27a668dccd..ec0c628fe750d 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -606,7 +606,7 @@ class LineJoiner { if (I[1]->Last->is(TT_LineComment)) return 0; do { - if (Tok->is(tok::l_brace) && Tok->BlockKind != BK_BracedInit) + if (Tok->is(tok::l_brace) && Tok->isNot(BK_BracedInit)) return 0; Tok = Tok->Next; } while (Tok); @@ -767,8 +767,8 @@ class LineFormatter { unsigned &Penalty) { const FormatToken *LBrace = State.NextToken->getPreviousNonComment(); FormatToken &Previous = *State.NextToken->Previous; - if (!LBrace || LBrace->isNot(tok::l_brace) || - LBrace->BlockKind != BK_Block || Previous.Children.size() == 0) + if (!LBrace || LBrace->isNot(tok::l_brace) || LBrace->isNot(BK_Block) || + Previous.Children.size() == 0) // The previous token does not open a block. Nothing to do. We don't // assert so that we can simply call this function for all tokens. return true; @@ -979,7 +979,7 @@ class OptimizingLineFormatter : public LineFormatter { // State already examined with lower penalty. continue; - FormatDecision LastFormat = Node->State.NextToken->Decision; + FormatDecision LastFormat = Node->State.NextToken->getDecision(); if (LastFormat == FD_Unformatted || LastFormat == FD_Continue) addNextStateToQueue(Penalty, Node, /*NewLine=*/false, &Count, &Queue); if (LastFormat == FD_Unformatted || LastFormat == FD_Break) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index a37386425aaed..b599168b48e17 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -472,19 +472,19 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // individual members in a type member list, which would normally // trigger BK_Block. In both cases, this must be parsed as an inline // braced init. - Tok->BlockKind = BK_BracedInit; + Tok->setBlockKind(BK_BracedInit); else if (PrevTok->is(tok::r_paren)) // `) { }` can only occur in function or method declarations in JS. 
- Tok->BlockKind = BK_Block; + Tok->setBlockKind(BK_Block); } else { - Tok->BlockKind = BK_Unknown; + Tok->setBlockKind(BK_Unknown); } LBraceStack.push_back(Tok); break; case tok::r_brace: if (LBraceStack.empty()) break; - if (LBraceStack.back()->BlockKind == BK_Unknown) { + if (LBraceStack.back()->is(BK_Unknown)) { bool ProbablyBracedList = false; if (Style.Language == FormatStyle::LK_Proto) { ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square); @@ -524,11 +524,11 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { } } if (ProbablyBracedList) { - Tok->BlockKind = BK_BracedInit; - LBraceStack.back()->BlockKind = BK_BracedInit; + Tok->setBlockKind(BK_BracedInit); + LBraceStack.back()->setBlockKind(BK_BracedInit); } else { - Tok->BlockKind = BK_Block; - LBraceStack.back()->BlockKind = BK_Block; + Tok->setBlockKind(BK_Block); + LBraceStack.back()->setBlockKind(BK_Block); } } LBraceStack.pop_back(); @@ -545,8 +545,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { case tok::kw_switch: case tok::kw_try: case tok::kw___try: - if (!LBraceStack.empty() && LBraceStack.back()->BlockKind == BK_Unknown) - LBraceStack.back()->BlockKind = BK_Block; + if (!LBraceStack.empty() && LBraceStack.back()->is(BK_Unknown)) + LBraceStack.back()->setBlockKind(BK_Block); break; default: break; @@ -557,8 +557,8 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // Assume other blocks for all unclosed opening braces. for (unsigned i = 0, e = LBraceStack.size(); i != e; ++i) { - if (LBraceStack[i]->BlockKind == BK_Unknown) - LBraceStack[i]->BlockKind = BK_Block; + if (LBraceStack[i]->is(BK_Unknown)) + LBraceStack[i]->setBlockKind(BK_Block); } FormatTok = Tokens->setPosition(StoredPosition); @@ -584,7 +584,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel, assert(FormatTok->isOneOf(tok::l_brace, TT_MacroBlockBegin) && "'{' or macro block token expected"); const bool MacroBlock = FormatTok->is(TT_MacroBlockBegin); - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); size_t PPStartHash = computePPHash(); @@ -614,7 +614,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel, if (MacroBlock ? !FormatTok->is(TT_MacroBlockEnd) : !FormatTok->is(tok::r_brace)) { Line->Level = InitialLevel; - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); return; } @@ -690,7 +690,7 @@ static bool ShouldBreakBeforeBrace(const FormatStyle &Style, } void UnwrappedLineParser::parseChildBlock() { - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); nextToken(); { bool SkipIndent = (Style.Language == FormatStyle::LK_JavaScript && @@ -1476,7 +1476,7 @@ void UnwrappedLineParser::parseStructuralElement() { // C# needs this change to ensure that array initialisers and object // initialisers are indented the same way. 
if (Style.isCSharp()) - FormatTok->BlockKind = BK_BracedInit; + FormatTok->setBlockKind(BK_BracedInit); nextToken(); parseBracedList(); } else if (Style.Language == FormatStyle::LK_Proto && @@ -1747,10 +1747,10 @@ void UnwrappedLineParser::tryToParseJSFunction() { } bool UnwrappedLineParser::tryToParseBracedList() { - if (FormatTok->BlockKind == BK_Unknown) + if (FormatTok->is(BK_Unknown)) calculateBraceTypes(); - assert(FormatTok->BlockKind != BK_Unknown); - if (FormatTok->BlockKind == BK_Block) + assert(FormatTok->isNot(BK_Unknown)); + if (FormatTok->is(BK_Block)) return false; nextToken(); parseBracedList(); @@ -1830,7 +1830,7 @@ bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, case tok::l_brace: // Assume there are no blocks inside a braced init list apart // from the ones we explicitly parse out (like lambdas). - FormatTok->BlockKind = BK_BracedInit; + FormatTok->setBlockKind(BK_BracedInit); nextToken(); parseBracedList(); break; @@ -2318,7 +2318,7 @@ bool UnwrappedLineParser::parseEnum() { // Just a declaration or something is wrong. if (FormatTok->isNot(tok::l_brace)) return true; - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); if (Style.Language == FormatStyle::LK_Java) { // Java enums are different. @@ -2726,7 +2726,7 @@ void UnwrappedLineParser::parseJavaScriptEs6ImportExport() { return; } if (FormatTok->is(tok::l_brace)) { - FormatTok->BlockKind = BK_Block; + FormatTok->setBlockKind(BK_Block); nextToken(); parseBracedList(); } else { diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 32e0b685ea0f4..2d479817118db 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -49,7 +49,7 @@ void WhitespaceManager::replaceWhitespace(FormatToken &Tok, unsigned Newlines, bool IsAligned, bool InPPDirective) { if (Tok.Finalized) return; - Tok.Decision = (Newlines > 0) ? FD_Break : FD_Continue; + Tok.setDecision((Newlines > 0) ? 
FD_Break : FD_Continue); Changes.push_back(Change(Tok, /*CreateReplacement=*/true, Tok.WhitespaceRange, Spaces, StartOfTokenColumn, Newlines, "", "", IsAligned, InPPDirective && !Tok.IsFirst, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 4fed36005c884..a7b7dda26e5fa 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1159,8 +1159,7 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, if (const Arg *A = Args.getLastArg(OPT_compress_debug_sections, OPT_compress_debug_sections_EQ)) { if (A->getOption().getID() == OPT_compress_debug_sections) { - // TODO: be more clever about the compression type auto-detection - Opts.setCompressDebugSections(llvm::DebugCompressionType::GNU); + Opts.setCompressDebugSections(llvm::DebugCompressionType::Z); } else { auto DCT = llvm::StringSwitch(A->getValue()) .Case("none", llvm::DebugCompressionType::None) diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index 7110404434405..2cf9cc7f1eb65 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -10,7 +10,9 @@ #ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__ #define __CLANG_HIP_LIBDEVICE_DECLARES_H__ +#ifdef __cplusplus extern "C" { +#endif // BEGIN FLOAT __device__ __attribute__((const)) float __ocml_acos_f32(float); @@ -316,7 +318,7 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); __device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL. { - return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; + return (__2f16){__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; } __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); @@ -325,6 +327,8 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +#ifdef __cplusplus } // extern "C" +#endif #endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__ diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 47d3c17175591..f9ca9bf606fb7 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -95,8 +95,10 @@ inline uint64_t __make_mantissa(const char *__tagp) { } // BEGIN FLOAT +#ifdef __cplusplus __DEVICE__ inline float abs(float __x) { return __ocml_fabs_f32(__x); } +#endif __DEVICE__ inline float acosf(float __x) { return __ocml_acos_f32(__x); } __DEVICE__ @@ -251,7 +253,7 @@ inline float nanf(const char *__tagp) { uint32_t sign : 1; } bits; - static_assert(sizeof(float) == sizeof(ieee_float), ""); + static_assert(sizeof(float) == sizeof(struct ieee_float), ""); } __tmp; __tmp.bits.sign = 0u; @@ -553,8 +555,10 @@ inline float __tanf(float __x) { return __ocml_tan_f32(__x); } // END FLOAT // BEGIN DOUBLE +#ifdef __cplusplus __DEVICE__ inline double abs(double __x) { return __ocml_fabs_f64(__x); } +#endif __DEVICE__ inline double acos(double __x) { return __ocml_acos_f64(__x); } __DEVICE__ @@ -712,7 +716,7 @@ inline double nan(const char *__tagp) { uint32_t exponent : 11; uint32_t sign : 1; } bits; - static_assert(sizeof(double) == sizeof(ieee_double), ""); + static_assert(sizeof(double) == sizeof(struct ieee_double), ""); 
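// Illustration of the C/C++ compatibility pattern the header changes above
// follow (the guards appear in the diff; this summary is added commentary):
//
//   #ifdef __cplusplus
//   extern "C" {   // linkage specifications only exist in C++
//   #endif
//   /* ... declarations usable from both languages ... */
//   #ifdef __cplusplus
//   } // extern "C"
//   #endif
//
// In the same spirit, (__2f16){...} is a C compound literal that Clang also
// accepts in C++, and sizeof(struct ieee_float) / sizeof(struct ieee_double)
// spell the struct tag explicitly because a plain 'ieee_double' is not a
// type name when the header is compiled as C.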
} __tmp; __tmp.bits.sign = 0u; @@ -1178,6 +1182,7 @@ __host__ inline static int max(int __arg1, int __arg2) { return std::max(__arg1, __arg2); } +#ifdef __cplusplus __DEVICE__ inline float pow(float __base, int __iexp) { return powif(__base, __iexp); } @@ -1188,6 +1193,7 @@ __DEVICE__ inline _Float16 pow(_Float16 __base, int __iexp) { return __ocml_pown_f16(__base, __iexp); } +#endif #pragma pop_macro("__DEF_FUN1") #pragma pop_macro("__DEF_FUN2") diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h index ac5f43836316e..f42200f5bd4e7 100644 --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -16597,6 +16597,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, } #endif +/* vec_xst_trunc */ + +#if defined(__POWER10_VECTOR__) && defined(__VSX__) +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed char *__ptr) { + *(__ptr + __offset) = (signed char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned char *__ptr) { + *(__ptr + __offset) = (unsigned char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed short *__ptr) { + *(__ptr + __offset) = (signed short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned short *__ptr) { + *(__ptr + __offset) = (unsigned short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed int *__ptr) { + *(__ptr + __offset) = (signed int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned int *__ptr) { + *(__ptr + __offset) = (unsigned int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed long long *__ptr) { + *(__ptr + __offset) = (signed long long)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned long long *__ptr) { + *(__ptr + __offset) = (unsigned long long)__vec[0]; +} +#endif + /* vec_xst_be */ #ifdef __LITTLE_ENDIAN__ @@ -16881,6 +16933,28 @@ vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) { return __builtin_altivec_vctzdm(__a, __b); } +/* vec_mod */ + +static __inline__ vector signed int __ATTRS_o_ai +vec_mod(vector signed int __a, vector signed int __b) { + return __a % __b; +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mod(vector unsigned int __a, vector unsigned int __b) { + return __a % __b; +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mod(vector signed long long __a, vector signed long long __b) { + return __a % __b; +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mod(vector unsigned long long __a, vector unsigned long long __b) { + return __a % __b; +} + /* vec_sldbi */ #define vec_sldb(__a, __b, __c) __builtin_altivec_vsldbi(__a, __b, (__c & 0x7)) diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index b78123834b643..967008b555f42 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -35,6 +35,13 @@ typedef unsigned long long __u64x2 typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 
__attribute__((__vector_size__(16), __aligned__(16))); +typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned char __u8x8 + __attribute__((__vector_size__(8), __aligned__(8))); +typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned short __u16x4 + __attribute__((__vector_size__(8), __aligned__(8))); + #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("simd128"), \ __min_vector_width__(128))) @@ -1089,42 +1096,70 @@ wasm_u16x8_narrow_i32x4(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2], + ((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5], + ((__i8x16)__a)[6], ((__i8x16)__a)[7]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10], + ((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13], + ((__i8x16)__a)[14], ((__i8x16)__a)[15]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2], + ((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5], + ((__u8x16)__a)[6], ((__u8x16)__a)[7]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10], + ((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13], + ((__u8x16)__a)[14], ((__u8x16)__a)[15]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2], + ((__i16x8)__a)[3]}, + __i32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6], + ((__i16x8)__a)[7]}, + __i32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2], + ((__u16x8)__a)[3]}, + __u32x4); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_high_u16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a); + return (v128_t) __builtin_convertvector( + (__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6], + ((__u16x8)__a)[7]}, + __u32x4); } // Undefine helper macros diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 160e2b6ed8846..58c28cea30a18 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ 
b/clang/lib/Lex/Preprocessor.cpp @@ -969,8 +969,10 @@ void Preprocessor::Lex(Token &Result) { LastTokenWasAt = Result.is(tok::at); --LexLevel; - if (LexLevel == 0 && !Result.getFlag(Token::IsReinjected)) { - ++TokenCount; + if ((LexLevel == 0 || PreprocessToken) && + !Result.getFlag(Token::IsReinjected)) { + if (LexLevel == 0) + ++TokenCount; if (OnToken) OnToken(Result); } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index b694fcfa460fb..148f23c41a49a 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2838,7 +2838,7 @@ Parser::DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS, case Sema::NC_Unknown: case Sema::NC_NonType: case Sema::NC_DependentNonType: - case Sema::NC_ContextIndependentExpr: + case Sema::NC_OverloadSet: case Sema::NC_VarTemplate: case Sema::NC_FunctionTemplate: case Sema::NC_Concept: diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 81e87582c6ade..4f662f00e1df3 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -1007,23 +1007,11 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, Res = Actions.ActOnCXXNullPtrLiteral(ConsumeToken()); break; - case tok::annot_uneval_primary_expr: case tok::annot_primary_expr: + case tok::annot_overload_set: Res = getExprAnnotation(Tok); - if (SavedKind == tok::annot_uneval_primary_expr) { - if (Expr *E = Res.get()) { - if (!E->isTypeDependent() && !E->containsErrors()) { - // TransformToPotentiallyEvaluated expects that it will still be in a - // (temporary) unevaluated context and then looks through that context - // to build it in the surrounding context. So we need to push an - // unevaluated context to balance things out. - EnterExpressionEvaluationContext Unevaluated( - Actions, Sema::ExpressionEvaluationContext::Unevaluated, - Sema::ReuseLambdaContextDecl); - Res = Actions.TransformToPotentiallyEvaluated(Res.get()); - } - } - } + if (!Res.isInvalid() && Tok.getKind() == tok::annot_overload_set) + Res = Actions.ActOnNameClassifiedAsOverloadSet(getCurScope(), Res.get()); ConsumeAnnotationToken(); if (!Res.isInvalid() && Tok.is(tok::less)) checkPotentialAngleBracket(Res); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 79744a7519e04..f24cb3d3f0b7d 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -869,7 +869,8 @@ void Parser::parseOMPTraitPropertyKind( return; } - TIProperty.Kind = getOpenMPContextTraitPropertyKind(Set, Name); + TIProperty.RawString = Name; + TIProperty.Kind = getOpenMPContextTraitPropertyKind(Set, Selector, Name); if (TIProperty.Kind != TraitProperty::invalid) { if (checkForDuplicates(*this, Name, NameLoc, Seen, CONTEXT_TRAIT_LVL)) TIProperty.Kind = TraitProperty::invalid; @@ -910,7 +911,7 @@ void Parser::parseOMPTraitPropertyKind( {TraitSet::construct, TraitSet::user, TraitSet::implementation, TraitSet::device}) { TraitProperty PropertyForName = - getOpenMPContextTraitPropertyKind(PotentialSet, Name); + getOpenMPContextTraitPropertyKind(PotentialSet, Selector, Name); if (PropertyForName == TraitProperty::invalid) continue; Diag(NameLoc, diag::note_omp_declare_variant_ctx_try) @@ -949,8 +950,8 @@ static bool checkExtensionProperty(Parser &P, SourceLocation Loc, for (OMPTraitProperty &SeenProp : TISelector.Properties) if (IsMatchExtension(SeenProp)) { P.Diag(Loc, diag::err_omp_variant_ctx_second_match_extension); - StringRef SeenName = - 
llvm::omp::getOpenMPContextTraitPropertyName(SeenProp.Kind); + StringRef SeenName = llvm::omp::getOpenMPContextTraitPropertyName( + SeenProp.Kind, SeenProp.RawString); SourceLocation SeenLoc = Seen[SeenName]; P.Diag(SeenLoc, diag::note_omp_declare_variant_ctx_used_here) << CONTEXT_TRAIT_LVL << SeenName; @@ -995,11 +996,13 @@ void Parser::parseOMPContextProperty(OMPTraitSelector &TISelector, } Diag(PropertyLoc, diag::warn_omp_ctx_incompatible_property_for_selector) - << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitPropertyName(TIProperty.Kind, + TIProperty.RawString) << getOpenMPContextTraitSelectorName(TISelector.Kind) << getOpenMPContextTraitSetName(Set); Diag(PropertyLoc, diag::note_omp_ctx_compatible_set_and_selector_for_property) - << getOpenMPContextTraitPropertyName(TIProperty.Kind) + << getOpenMPContextTraitPropertyName(TIProperty.Kind, + TIProperty.RawString) << getOpenMPContextTraitSelectorName( getOpenMPContextTraitSelectorForProperty(TIProperty.Kind)) << getOpenMPContextTraitSetName( @@ -1045,8 +1048,8 @@ void Parser::parseOMPTraitSelectorKind( for (const auto &PotentialSet : {TraitSet::construct, TraitSet::user, TraitSet::implementation, TraitSet::device}) { - TraitProperty PropertyForName = - getOpenMPContextTraitPropertyKind(PotentialSet, Name); + TraitProperty PropertyForName = getOpenMPContextTraitPropertyKind( + PotentialSet, TraitSelector::invalid, Name); if (PropertyForName == TraitProperty::invalid) continue; Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) @@ -1140,7 +1143,8 @@ void Parser::parseOMPContextSelector( if (!RequiresProperty) { TISelector.Properties.push_back( - {getOpenMPContextTraitPropertyForSelector(TISelector.Kind)}); + {getOpenMPContextTraitPropertyForSelector(TISelector.Kind), + getOpenMPContextTraitSelectorName(TISelector.Kind)}); return; } @@ -1157,7 +1161,8 @@ void Parser::parseOMPContextSelector( if (!Condition.isUsable()) return FinishSelector(); TISelector.ScoreOrCondition = Condition.get(); - TISelector.Properties.push_back({TraitProperty::user_condition_unknown}); + TISelector.Properties.push_back( + {TraitProperty::user_condition_unknown, ""}); return; } @@ -1236,8 +1241,8 @@ void Parser::parseOMPTraitSetKind(OMPTraitSet &TISet, for (const auto &PotentialSet : {TraitSet::construct, TraitSet::user, TraitSet::implementation, TraitSet::device}) { - TraitProperty PropertyForName = - getOpenMPContextTraitPropertyKind(PotentialSet, Name); + TraitProperty PropertyForName = getOpenMPContextTraitPropertyKind( + PotentialSet, TraitSelector::invalid, Name); if (PropertyForName == TraitProperty::invalid) continue; Diag(NameLoc, diag::note_omp_declare_variant_ctx_is_a) @@ -1820,8 +1825,15 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( VariantMatchInfo VMI; ASTContext &ASTCtx = Actions.getASTContext(); TI.getAsVariantMatchInfo(ASTCtx, VMI); - OMPContext OMPCtx(ASTCtx.getLangOpts().OpenMPIsDevice, - ASTCtx.getTargetInfo().getTriple()); + + std::function DiagUnknownTrait = [this, Loc]( + StringRef ISATrait) { + // TODO Track the selector locations in a way that is accessible here to + // improve the diagnostic location. 
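// For illustration, the kind of input this new callback diagnoses
// (hypothetical directive, not from the patch):
//
//   #pragma omp begin declare variant match(device = {isa("not-a-real-isa")})
//   void foo(void);
//   #pragma omp end declare variant
//
// Previously an unknown ISA trait was silently treated as non-matching; with
// DiagUnknownTrait wired into the context it is reported via
// warn_unknown_begin_declare_variant_isa_trait.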
+ Diag(Loc, diag::warn_unknown_begin_declare_variant_isa_trait) << ISATrait; + }; + TargetOMPContext OMPCtx(ASTCtx, std::move(DiagUnknownTrait), + /* CurrentFunctionDecl */ nullptr); if (isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ true)) { Actions.ActOnOpenMPBeginDeclareVariant(Loc, TI); @@ -3179,7 +3191,6 @@ static void parseMapType(Parser &P, Parser::OpenMPVarListDataTy &Data) { /// Parses simple expression in parens for single-expression clauses of OpenMP /// constructs. -/// \param RLoc Returned location of right paren. ExprResult Parser::ParseOpenMPIteratorsExpr() { assert(Tok.is(tok::identifier) && PP.getSpelling(Tok) == "iterator" && "Expected 'iterator' token."); @@ -3441,36 +3452,43 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, if (Tok.is(tok::colon)) Data.ColonLoc = ConsumeToken(); } else if (Kind == OMPC_to || Kind == OMPC_from) { - if (Tok.is(tok::identifier)) { - bool IsMapperModifier = false; - if (Kind == OMPC_to) { - auto Modifier = - static_cast<OpenMPToModifierKind>(getOpenMPSimpleClauseType( - Kind, PP.getSpelling(Tok), getLangOpts().OpenMP)); - if (Modifier == OMPC_TO_MODIFIER_mapper) - IsMapperModifier = true; - } else { - auto Modifier = - static_cast<OpenMPFromModifierKind>(getOpenMPSimpleClauseType( - Kind, PP.getSpelling(Tok), getLangOpts().OpenMP)); - if (Modifier == OMPC_FROM_MODIFIER_mapper) - IsMapperModifier = true; + while (Tok.is(tok::identifier)) { + auto Modifier = + static_cast<OpenMPMotionModifierKind>(getOpenMPSimpleClauseType( + Kind, PP.getSpelling(Tok), getLangOpts().OpenMP)); + if (Modifier == OMPC_MOTION_MODIFIER_unknown) + break; + Data.MotionModifiers.push_back(Modifier); + Data.MotionModifiersLoc.push_back(Tok.getLocation()); + ConsumeToken(); + if (Modifier == OMPC_MOTION_MODIFIER_mapper) { + IsInvalidMapperModifier = parseMapperModifier(Data); + if (IsInvalidMapperModifier) + break; } - if (IsMapperModifier) { - // Parse the mapper modifier. + // OpenMP < 5.1 doesn't permit a ',' or additional modifiers. + if (getLangOpts().OpenMP < 51) + break; + // OpenMP 5.1 accepts an optional ',' even if the next character is ':'. + // TODO: Is that intentional? + if (Tok.is(tok::comma)) ConsumeToken(); - IsInvalidMapperModifier = parseMapperModifier(Data); - if (Tok.isNot(tok::colon)) { - if (!IsInvalidMapperModifier) - Diag(Tok, diag::warn_pragma_expected_colon) << ")"; - SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end, - StopBeforeMatch); - } - // Consume ':'. - if (Tok.is(tok::colon)) - ConsumeToken(); + } + if (!Data.MotionModifiers.empty() && Tok.isNot(tok::colon)) { + if (!IsInvalidMapperModifier) { + if (getLangOpts().OpenMP < 51) + Diag(Tok, diag::warn_pragma_expected_colon) << ")"; + else + Diag(Tok, diag::warn_pragma_expected_colon) << "motion modifier"; } + SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end, + StopBeforeMatch); } + // OpenMP 5.1 permits a ':' even without a preceding modifier. TODO: Is // that intentional? 
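// For illustration (assumed per the OpenMP 5.1 grammar, not text from the
// patch): the parsing loop above lets 'to'/'from' accept a comma-separated
// motion-modifier list before the colon, so all of these now parse:
//
//   #pragma omp target update to(x)                       // no modifier
//   #pragma omp target update to(mapper(m): x)            // OpenMP 5.0 form
//   #pragma omp target update from(present, mapper(m): y) // OpenMP 5.1 form
//
// Pre-5.1 language modes still stop after a single modifier and do not
// accept the ',' separator.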
+ if ((!Data.MotionModifiers.empty() || getLangOpts().OpenMP >= 51) && + Tok.is(tok::colon)) + Data.ColonLoc = ConsumeToken(); } else if (Kind == OMPC_allocate || (Kind == OMPC_affinity && Tok.is(tok::identifier) && PP.getSpelling(Tok) == "iterator")) { @@ -3646,6 +3664,7 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, Kind, Vars, Data.DepModOrTailExpr, Locs, Data.ColonLoc, Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, Data.ExtraModifier, Data.MapTypeModifiers, Data.MapTypeModifiersLoc, - Data.IsMapTypeImplicit, Data.ExtraModifierLoc); + Data.IsMapTypeImplicit, Data.ExtraModifierLoc, Data.MotionModifiers, + Data.MotionModifiersLoc); } diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index f026f3a1bfb29..d0f1d2e09a872 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -1276,15 +1276,6 @@ Parser::isCXXDeclarationSpecifier(Parser::TPResult BracedCastResult, // this is ambiguous. Typo-correct to type and expression keywords and // to types and identifiers, in order to try to recover from errors. TentativeParseCCC CCC(Next); - // Tentative parsing may not be done in the right evaluation context - // for the ultimate expression. Enter an unevaluated context to prevent - // Sema from immediately e.g. treating this lookup as a potential ODR-use. - // If we generate an expression annotation token and the parser actually - // claims it as an expression, we'll transform the expression to a - // potentially-evaluated one then. - EnterExpressionEvaluationContext Unevaluated( - Actions, Sema::ExpressionEvaluationContext::Unevaluated, - Sema::ReuseLambdaContextDecl); switch (TryAnnotateName(&CCC)) { case ANK_Error: return TPResult::Error; diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 764d4e8e9d522..45cf855cf8c9b 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1691,9 +1691,8 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC) { return ANK_Success; } - case Sema::NC_ContextIndependentExpr: - Tok.setKind(Actions.isUnevaluatedContext() ? 
tok::annot_uneval_primary_expr - : tok::annot_primary_expr); + case Sema::NC_OverloadSet: + Tok.setKind(tok::annot_overload_set); setExprAnnotation(Tok, Classification.getExpression()); Tok.setAnnotationEndLoc(NameLoc); if (SS.isNotEmpty()) diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index c135b2b46abfe..927e8f5a6608d 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -3,16 +3,6 @@ set(LLVM_LINK_COMPONENTS Support ) -if (MSVC) - set_source_files_properties(SemaDeclAttr.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaExpr.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaExprCXX.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaSYCL.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaTemplate.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaTemplateDeduction.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SemaOpenMP.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() - clang_tablegen(OpenCLBuiltins.inc -gen-clang-opencl-builtins SOURCE OpenCLBuiltins.td TARGET ClangOpenCLBuiltinsImpl diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 0f351dfc9cc8e..046571e25e441 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1234,7 +1234,7 @@ void Sema::ActOnEndOfTranslationUnit() { if (DiagD->isReferenced()) { if (isa<CXXMethodDecl>(DiagD)) Diag(DiagD->getLocation(), diag::warn_unneeded_member_function) - << DiagD->getDeclName(); + << DiagD; else { if (FD->getStorageClass() == SC_Static && !FD->isInlineSpecified() && @@ -1242,20 +1242,20 @@ void Sema::ActOnEndOfTranslationUnit() { SourceMgr.getExpansionLoc(FD->getLocation()))) Diag(DiagD->getLocation(), diag::warn_unneeded_static_internal_decl) - << DiagD->getDeclName(); + << DiagD; else Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl) - << /*function*/0 << DiagD->getDeclName(); + << /*function*/ 0 << DiagD; } } else { if (FD->getDescribedFunctionTemplate()) Diag(DiagD->getLocation(), diag::warn_unused_template) - << /*function*/0 << DiagD->getDeclName(); + << /*function*/ 0 << DiagD; else - Diag(DiagD->getLocation(), - isa<CXXMethodDecl>(DiagD) ? diag::warn_unused_member_function + Diag(DiagD->getLocation(), isa<CXXMethodDecl>(DiagD) + ? 
diag::warn_unused_member_function : diag::warn_unused_function) - << DiagD->getDeclName(); + << DiagD; } } else { const VarDecl *DiagD = cast(*I)->getDefinition(); @@ -1263,20 +1263,19 @@ void Sema::ActOnEndOfTranslationUnit() { DiagD = cast(*I); if (DiagD->isReferenced()) { Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl) - << /*variable*/1 << DiagD->getDeclName(); + << /*variable*/ 1 << DiagD; } else if (DiagD->getType().isConstQualified()) { const SourceManager &SM = SourceMgr; if (SM.getMainFileID() != SM.getFileID(DiagD->getLocation()) || !PP.getLangOpts().IsHeaderFile) Diag(DiagD->getLocation(), diag::warn_unused_const_variable) - << DiagD->getDeclName(); + << DiagD; } else { if (DiagD->getDescribedVarTemplate()) Diag(DiagD->getLocation(), diag::warn_unused_template) - << /*variable*/1 << DiagD->getDeclName(); + << /*variable*/ 1 << DiagD; else - Diag(DiagD->getLocation(), diag::warn_unused_variable) - << DiagD->getDeclName(); + Diag(DiagD->getLocation(), diag::warn_unused_variable) << DiagD; } } } diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 2efe26052c78c..58cf3a1be7301 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -510,12 +510,10 @@ static void diagnoseBadCast(Sema &S, unsigned msg, CastType castType, if (RecFrom && RecTo) { auto DeclFrom = RecFrom->getAsCXXRecordDecl(); if (!DeclFrom->isCompleteDefinition()) - S.Diag(DeclFrom->getLocation(), diag::note_type_incomplete) - << DeclFrom->getDeclName(); + S.Diag(DeclFrom->getLocation(), diag::note_type_incomplete) << DeclFrom; auto DeclTo = RecTo->getAsCXXRecordDecl(); if (!DeclTo->isCompleteDefinition()) - S.Diag(DeclTo->getLocation(), diag::note_type_incomplete) - << DeclTo->getDeclName(); + S.Diag(DeclTo->getLocation(), diag::note_type_incomplete) << DeclTo; } } } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index fa383bcd89a16..14da704ad3e17 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1274,11 +1274,8 @@ static bool SemaBuiltinPipePackets(Sema &S, CallExpr *Call) { // \return True if a semantic error has been found, false otherwise. static bool SemaOpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID, CallExpr *Call) { - if (Call->getNumArgs() != 1) { - S.Diag(Call->getBeginLoc(), diag::err_opencl_builtin_to_addr_arg_num) - << Call->getDirectCallee() << Call->getSourceRange(); + if (checkArgCount(S, Call, 1)) return true; - } auto RT = Call->getArg(0)->getType(); if (!RT->isPointerType() || RT->getPointeeType() @@ -5708,21 +5705,8 @@ bool Sema::SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall) { if (checkVAStartABI(*this, BuiltinID, Fn)) return true; - if (TheCall->getNumArgs() > 2) { - Diag(TheCall->getArg(2)->getBeginLoc(), - diag::err_typecheck_call_too_many_args) - << 0 /*function call*/ << 2 << TheCall->getNumArgs() - << Fn->getSourceRange() - << SourceRange(TheCall->getArg(2)->getBeginLoc(), - (*(TheCall->arg_end() - 1))->getEndLoc()); + if (checkArgCount(*this, TheCall, 2)) return true; - } - - if (TheCall->getNumArgs() < 2) { - return Diag(TheCall->getEndLoc(), - diag::err_typecheck_call_too_few_args_at_least) - << 0 /*function call*/ << 2 << TheCall->getNumArgs(); - } // Type-check the first argument normally. if (checkBuiltinArgument(*this, TheCall, 0)) @@ -5832,15 +5816,8 @@ bool Sema::SemaBuiltinVAStartARMMicrosoft(CallExpr *Call) { /// SemaBuiltinUnorderedCompare - Handle functions like __builtin_isgreater and /// friends. 
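// (Aside on the checkArgCount conversions in this file: the helper already
// exists near the top of SemaChecking.cpp; rough shape, details assumed:
//
//   static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount);
//   // emits err_typecheck_call_too_few_args / err_typecheck_call_too_many_args
//   // and returns true when the argument count mismatches
//
// so each builtin check collapses to 'if (checkArgCount(S, TheCall, N)) return true;'.)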
This is declared to take (...), so we have to check everything. bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall) { - if (TheCall->getNumArgs() < 2) - return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args) - << 0 << 2 << TheCall->getNumArgs() /*function call*/; - if (TheCall->getNumArgs() > 2) - return Diag(TheCall->getArg(2)->getBeginLoc(), - diag::err_typecheck_call_too_many_args) - << 0 /*function call*/ << 2 << TheCall->getNumArgs() - << SourceRange(TheCall->getArg(2)->getBeginLoc(), - (*(TheCall->arg_end() - 1))->getEndLoc()); + if (checkArgCount(*this, TheCall, 2)) + return true; ExprResult OrigArg0 = TheCall->getArg(0); ExprResult OrigArg1 = TheCall->getArg(1); @@ -5878,15 +5855,8 @@ bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall) { /// to check everything. We expect the last argument to be a floating point /// value. bool Sema::SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs) { - if (TheCall->getNumArgs() < NumArgs) - return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args) - << 0 << NumArgs << TheCall->getNumArgs() /*function call*/; - if (TheCall->getNumArgs() > NumArgs) - return Diag(TheCall->getArg(NumArgs)->getBeginLoc(), - diag::err_typecheck_call_too_many_args) - << 0 /*function call*/ << NumArgs << TheCall->getNumArgs() - << SourceRange(TheCall->getArg(NumArgs)->getBeginLoc(), - (*(TheCall->arg_end() - 1))->getEndLoc()); + if (checkArgCount(*this, TheCall, NumArgs)) + return true; // __builtin_fpclassify is the only case where NumArgs != 1, so we can count // on all preceding parameters just being int. Try all of those. @@ -5990,17 +5960,8 @@ bool Sema::SemaBuiltinComplex(CallExpr *TheCall) { // vector short vec_xxsldwi(vector short, vector short, int); bool Sema::SemaBuiltinVSX(CallExpr *TheCall) { unsigned ExpectedNumArgs = 3; - if (TheCall->getNumArgs() < ExpectedNumArgs) - return Diag(TheCall->getEndLoc(), - diag::err_typecheck_call_too_few_args_at_least) - << 0 /*function call*/ << ExpectedNumArgs << TheCall->getNumArgs() - << TheCall->getSourceRange(); - - if (TheCall->getNumArgs() > ExpectedNumArgs) - return Diag(TheCall->getEndLoc(), - diag::err_typecheck_call_too_many_args_at_most) - << 0 /*function call*/ << ExpectedNumArgs << TheCall->getNumArgs() - << TheCall->getSourceRange(); + if (checkArgCount(*this, TheCall, ExpectedNumArgs)) + return true; // Check the third argument is a compile time constant if (!TheCall->getArg(2)->isIntegerConstantExpr(Context)) @@ -14098,8 +14059,7 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, if (ND) DiagRuntimeBehavior(ND->getBeginLoc(), BaseExpr, - PDiag(diag::note_array_declared_here) - << ND->getDeclName()); + PDiag(diag::note_array_declared_here) << ND); } void Sema::CheckArrayAccess(const Expr *expr) { diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp old mode 100755 new mode 100644 diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index 992cccac64050..990ab26335209 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -449,7 +449,8 @@ static ReadySuspendResumeResult buildCoawaitCalls(Sema &S, VarDecl *CoroPromise, if (!AwaitSuspend->getType()->isDependentType()) { // [expr.await]p3 [...] // - await-suspend is the expression e.await_suspend(h), which shall be - // a prvalue of type void or bool. + // a prvalue of type void, bool, or std::coroutine_handle for some + // type Z. 
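// Example awaiter signatures for the three permitted await_suspend return
// types (illustration only, not part of the patch):
//
//   void await_suspend(std::coroutine_handle<>);   // unconditional suspend
//   bool await_suspend(std::coroutine_handle<>);   // false resumes immediately
//   std::coroutine_handle<> await_suspend(std::coroutine_handle<>);
//                                                  // symmetric transfer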
QualType RetType = AwaitSuspend->getCallReturnType(S.Context); // Experimental support for coroutine_handle returning await_suspend. diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 73150e03fea5f..d322e6dbe4438 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -1184,23 +1184,20 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS, return ParsedType::make(T); } - // FIXME: This is context-dependent. We need to defer building the member - // expression until the classification is consumed. - if (FirstDecl->isCXXClassMember()) - return NameClassification::ContextIndependentExpr( - BuildPossibleImplicitMemberExpr(SS, SourceLocation(), Result, nullptr, - S)); - // If we already know which single declaration is referenced, just annotate - // that declaration directly. + // that declaration directly. Defer resolving even non-overloaded class + // member accesses, as we need to defer certain access checks until we know + // the context. bool ADL = UseArgumentDependentLookup(SS, Result, NextToken.is(tok::l_paren)); - if (Result.isSingleResult() && !ADL) + if (Result.isSingleResult() && !ADL && !FirstDecl->isCXXClassMember()) return NameClassification::NonType(Result.getRepresentativeDecl()); - // Build an UnresolvedLookupExpr. Note that this doesn't depend on the - // context in which we performed classification, so it's safe to do now. - return NameClassification::ContextIndependentExpr( - BuildDeclarationNameExpr(SS, Result, ADL)); + // Otherwise, this is an overload set that we will need to resolve later. + Result.suppressDiagnostics(); + return NameClassification::OverloadSet(UnresolvedLookupExpr::Create( + Context, Result.getNamingClass(), SS.getWithLocInContext(Context), + Result.getLookupNameInfo(), ADL, Result.isOverloadedResult(), + Result.begin(), Result.end())); } ExprResult @@ -1240,6 +1237,30 @@ ExprResult Sema::ActOnNameClassifiedAsNonType(Scope *S, const CXXScopeSpec &SS, return BuildDeclarationNameExpr(SS, Result, ADL); } +ExprResult Sema::ActOnNameClassifiedAsOverloadSet(Scope *S, Expr *E) { + // For an implicit class member access, transform the result into a member + // access expression if necessary. + auto *ULE = cast<UnresolvedLookupExpr>(E); + if ((*ULE->decls_begin())->isCXXClassMember()) { + CXXScopeSpec SS; + SS.Adopt(ULE->getQualifierLoc()); + + // Reconstruct the lookup result. + LookupResult Result(*this, ULE->getName(), ULE->getNameLoc(), + LookupOrdinaryName); + Result.setNamingClass(ULE->getNamingClass()); + for (auto I = ULE->decls_begin(), E = ULE->decls_end(); I != E; ++I) + Result.addDecl(*I, I.getAccess()); + Result.resolveKind(); + return BuildPossibleImplicitMemberExpr(SS, SourceLocation(), Result, + nullptr, S); + } + + // Otherwise, this is already in the form we needed, and no further checks + // are necessary. + return ULE; +} + Sema::TemplateNameKindForDiagnostics Sema::getTemplateNameKindForDiagnostics(TemplateName Name) { auto *TD = Name.getAsTemplateDecl(); @@ -1905,7 +1926,7 @@ static void CheckPoppedLabel(LabelDecl *L, Sema &S) { else Diagnose = L->getStmt() == nullptr; if (Diagnose) - S.Diag(L->getLocation(), diag::err_undeclared_label_use) << L->getDeclName(); + S.Diag(L->getLocation(), diag::err_undeclared_label_use) << L; } void Sema::ActOnPopScope(SourceLocation Loc, Scope *S) { @@ -4745,7 +4766,10 @@ Sema::ParsedFreeStandingDeclSpec(Scope *S, AccessSpecifier AS, DeclSpec &DS, if (!DeclaresAnything) { // In C, we allow this as a (popular) extension / bug. 
Don't bother // producing further diagnostics for redundant qualifiers after this. - Diag(DS.getBeginLoc(), diag::ext_no_declarators) << DS.getSourceRange(); + Diag(DS.getBeginLoc(), (IsExplicitInstantiation || !TemplateParams.empty()) + ? diag::err_no_declarators + : diag::ext_no_declarators) + << DS.getSourceRange(); return TagD; } @@ -6969,19 +6993,18 @@ NamedDecl *Sema::ActOnVariableDeclarator( TemplateParams->getRAngleLoc()); TemplateParams = nullptr; } else { + // Check that we can declare a template here. + if (CheckTemplateDeclScope(S, TemplateParams)) + return nullptr; + if (D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId) { // This is an explicit specialization or a partial specialization. - // FIXME: Check that we can declare a specialization here. IsVariableTemplateSpecialization = true; IsPartialSpecialization = TemplateParams->size() > 0; } else { // if (TemplateParams->size() > 0) // This is a template declaration. IsVariableTemplate = true; - // Check that we can declare a template here. - if (CheckTemplateDeclScope(S, TemplateParams)) - return nullptr; - // Only C++1y supports variable templates (N3651). Diag(D.getIdentifierLoc(), getLangOpts().CPlusPlus14 @@ -6990,6 +7013,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( } } } else { + // Check that we can declare a member specialization here. + if (!TemplateParamLists.empty() && IsMemberSpecialization && + CheckTemplateDeclScope(S, TemplateParamLists.back())) + return nullptr; assert((Invalid || D.getName().getKind() != UnqualifiedIdKind::IK_TemplateId) && "should have a 'template<>' for this decl"); @@ -7179,9 +7206,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); else if (NewVD->hasLocalStorage()) Diag(NewVD->getLocation(), diag::err_module_private_local) - << 0 << NewVD->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 0 << NewVD + << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval( + D.getDeclSpec().getModulePrivateSpecLoc()); else { NewVD->setModulePrivate(); if (NewTemplate) @@ -8922,13 +8950,13 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, TemplateParamLists, isFriend, isMemberSpecialization, Invalid); if (TemplateParams) { + // Check that we can declare a template here. + if (CheckTemplateDeclScope(S, TemplateParams)) + NewFD->setInvalidDecl(); + if (TemplateParams->size() > 0) { // This is a function template - // Check that we can declare a template here. - if (CheckTemplateDeclScope(S, TemplateParams)) - NewFD->setInvalidDecl(); - // A destructor cannot be a template. if (Name.getNameKind() == DeclarationName::CXXDestructorName) { Diag(NewFD->getLocation(), diag::err_destructor_template); @@ -8987,6 +9015,11 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, } } } else { + // Check that we can declare a template here. + if (!TemplateParamLists.empty() && isMemberSpecialization && + CheckTemplateDeclScope(S, TemplateParamLists.back())) + NewFD->setInvalidDecl(); + // All template param lists were matched against the scope specifier: // this is NOT (an explicit specialization of) a template. 
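// Since CheckTemplateDeclScope now also runs for explicit and member
// specializations, declarations like the following are diagnosed instead of
// slipping through (hypothetical test input, not from the patch):
//
//   template <typename T> int v = 0;
//   void f() {
//     template <> int v<int>;  // error: specialization declared in an
//   }                          // invalid scope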
if (TemplateParamLists.size() > 0) @@ -12417,7 +12450,7 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { !Context.getTargetInfo().getCXXABI().isMicrosoft()) { Diag(Var->getLocation(), diag::err_constexpr_static_mem_var_requires_init) - << Var->getDeclName(); + << Var; Var->setInvalidDecl(); return; } @@ -12550,8 +12583,7 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { // definitions with reference type. if (Type->isReferenceType()) { Diag(Var->getLocation(), diag::err_reference_var_requires_init) - << Var->getDeclName() - << SourceRange(Var->getLocation(), Var->getLocation()); + << Var << SourceRange(Var->getLocation(), Var->getLocation()); Var->setInvalidDecl(); return; } @@ -12689,7 +12721,7 @@ void Sema::ActOnCXXForRangeDecl(Decl *D) { } if (Error != -1) { Diag(VD->getOuterLocStart(), diag::err_for_range_storage_class) - << VD->getDeclName() << Error; + << VD << Error; D->setInvalidDecl(); } } @@ -13473,9 +13505,8 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D) { if (D.getDeclSpec().isModulePrivateSpecified()) Diag(New->getLocation(), diag::err_module_private_local) - << 1 << New->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 1 << New << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); if (New->hasAttr()) { Diag(New->getLocation(), diag::err_block_on_nonlocal); @@ -13527,8 +13558,7 @@ void Sema::DiagnoseSizeOfParametersAndReturnValue( if (!ReturnTy->isDependentType() && ReturnTy.isPODType(Context)) { unsigned Size = Context.getTypeSizeInChars(ReturnTy).getQuantity(); if (Size > LangOpts.NumLargeByValueCopy) - Diag(D->getLocation(), diag::warn_return_value_size) - << D->getDeclName() << Size; + Diag(D->getLocation(), diag::warn_return_value_size) << D << Size; } // Warn if any parameter is pass-by-value and larger than the specified @@ -13540,7 +13570,7 @@ void Sema::DiagnoseSizeOfParametersAndReturnValue( unsigned Size = Context.getTypeSizeInChars(T).getQuantity(); if (Size > LangOpts.NumLargeByValueCopy) Diag(Parameter->getLocation(), diag::warn_parameter_size) - << Parameter->getDeclName() << Size; + << Parameter << Size; } } @@ -13848,9 +13878,9 @@ Sema::CheckForFunctionRedefinition(FunctionDecl *FD, if (getLangOpts().GNUMode && Definition->isInlineSpecified() && Definition->getStorageClass() == SC_Extern) Diag(FD->getLocation(), diag::err_redefinition_extern_inline) - << FD->getDeclName() << getLangOpts().CPlusPlus; + << FD << getLangOpts().CPlusPlus; else - Diag(FD->getLocation(), diag::err_redefinition) << FD->getDeclName(); + Diag(FD->getLocation(), diag::err_redefinition) << FD; Diag(Definition->getLocation(), diag::note_previous_definition); FD->setInvalidDecl(); @@ -14905,9 +14935,10 @@ TypedefDecl *Sema::ParseTypedefDecl(Scope *S, Declarator &D, QualType T, if (D.getDeclSpec().isModulePrivateSpecified()) { if (CurContext->isFunctionOrMethod()) Diag(NewTD->getLocation(), diag::err_module_private_local) - << 2 << NewTD->getDeclName() - << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) - << FixItHint::CreateRemoval(D.getDeclSpec().getModulePrivateSpecLoc()); + << 2 << NewTD + << SourceRange(D.getDeclSpec().getModulePrivateSpecLoc()) + << FixItHint::CreateRemoval( + D.getDeclSpec().getModulePrivateSpecLoc()); else NewTD->setModulePrivate(); } @@ -15295,6 +15326,10 @@ Decl *Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, isMemberSpecialization = true; } 
} + + if (!TemplateParameterLists.empty() && isMemberSpecialization && + CheckTemplateDeclScope(S, TemplateParameterLists.back())) + return nullptr; } // Figure out the underlying type if this a enum declaration. We need to do @@ -17294,7 +17329,7 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, CXXRecordDecl *CXXRecord = cast(Record); CheckForZeroSize = CXXRecord->getLexicalDeclContext()->isExternCContext() && - !CXXRecord->isDependentType() && + !CXXRecord->isDependentType() && !inTemplateInstantiation() && CXXRecord->isCLike(); } if (CheckForZeroSize) { diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 054f880600407..dfdc73fbbb74f 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3798,15 +3798,15 @@ static void handleTransparentUnionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { S.Context.getTypeAlign(FieldType) > FirstAlign) { // Warn if we drop the attribute. bool isSize = S.Context.getTypeSize(FieldType) != FirstSize; - unsigned FieldBits = isSize? S.Context.getTypeSize(FieldType) - : S.Context.getTypeAlign(FieldType); + unsigned FieldBits = isSize ? S.Context.getTypeSize(FieldType) + : S.Context.getTypeAlign(FieldType); S.Diag(Field->getLocation(), - diag::warn_transparent_union_attribute_field_size_align) - << isSize << Field->getDeclName() << FieldBits; - unsigned FirstBits = isSize? FirstSize : FirstAlign; + diag::warn_transparent_union_attribute_field_size_align) + << isSize << *Field << FieldBits; + unsigned FirstBits = isSize ? FirstSize : FirstAlign; S.Diag(FirstField->getLocation(), diag::note_transparent_union_first_field_size_align) - << isSize << FirstBits; + << isSize << FirstBits; return; } } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 97f74fa436a7a..532e7d5b9cdc2 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -94,7 +94,7 @@ static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) { A->getSemanticSpelling() != UnusedAttr::C2x_maybe_unused) { const Decl *DC = cast_or_null(S.getCurObjCLexicalContext()); if (DC && !DC->hasAttr()) - S.Diag(Loc, diag::warn_used_but_marked_unused) << D->getDeclName(); + S.Diag(Loc, diag::warn_used_but_marked_unused) << D; } } } @@ -4365,7 +4365,6 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::UnaryTransform: case Type::Attributed: case Type::SubstTemplateTypeParm: - case Type::PackExpansion: case Type::MacroQualified: // Keep walking after single level desugaring. 
T = T.getSingleStepDesugaredType(Context); @@ -5587,9 +5586,8 @@ bool Sema::CheckCXXDefaultArgExpr(SourceLocation CallLoc, FunctionDecl *FD, return true; } - Diag(CallLoc, - diag::err_use_of_default_argument_to_function_declared_later) << - FD << cast(FD->getDeclContext())->getDeclName(); + Diag(CallLoc, diag::err_use_of_default_argument_to_function_declared_later) + << FD << cast(FD->getDeclContext()); Diag(UnparsedDefaultArgLocs[Param], diag::note_default_argument_declared_here); return true; @@ -16959,8 +16957,7 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, if (Var->getType()->isVariablyModifiedType() && IsBlock) { if (Diagnose) { S.Diag(Loc, diag::err_ref_vm_type); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -16972,10 +16969,8 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, if (IsBlock) S.Diag(Loc, diag::err_ref_flexarray_type); else - S.Diag(Loc, diag::err_lambda_capture_flexarray_type) - << Var->getDeclName(); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Loc, diag::err_lambda_capture_flexarray_type) << Var; + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -16985,10 +16980,8 @@ static bool isVariableCapturable(CapturingScopeInfo *CSI, VarDecl *Var, // variables; they don't support the expected semantics. if (HasBlocksAttr && (IsLambda || isa(CSI))) { if (Diagnose) { - S.Diag(Loc, diag::err_capture_block_variable) - << Var->getDeclName() << !IsLambda; - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Loc, diag::err_capture_block_variable) << Var << !IsLambda; + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; } return false; } @@ -17019,8 +17012,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, VarDecl *Var, if (!Invalid && !S.getLangOpts().OpenCL && CaptureType->isArrayType()) { if (BuildAndDiagnose) { S.Diag(Loc, diag::err_ref_array_type); - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; Invalid = true; } else { return false; @@ -17033,8 +17025,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, VarDecl *Var, if (BuildAndDiagnose) { S.Diag(Loc, diag::err_arc_autoreleasing_capture) << /*block*/ 0; - S.Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + S.Diag(Var->getLocation(), diag::note_previous_decl) << Var; Invalid = true; } else { return false; @@ -17304,9 +17295,8 @@ bool Sema::tryCaptureVariable( if (BuildAndDiagnose) { LambdaScopeInfo *LSI = cast(CSI); if (LSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_None) { - Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName(); - Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + Diag(ExprLoc, diag::err_lambda_impcap) << Var; + Diag(Var->getLocation(), diag::note_previous_decl) << Var; Diag(LSI->Lambda->getBeginLoc(), diag::note_lambda_decl); } else diagnoseUncapturableValueReference(*this, ExprLoc, Var, DC); @@ -17380,9 +17370,8 @@ bool Sema::tryCaptureVariable( // No capture-default, and this is not an explicit capture // so cannot capture this variable. 
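// Note on the recurring '<< Var' / '<< FD' cleanups in this commit: the
// diagnostics engine has a NamedDecl* streaming overload, so passing the
// declaration itself (rather than its DeclarationName) lets the printer
// control quoting and name formatting. Sketch of the two forms:
//
//   Diag(Loc, diag::note_previous_decl) << Var;                  // preferred
//   Diag(Loc, diag::note_previous_decl) << Var->getDeclName();   // older form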
if (BuildAndDiagnose) { - Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName(); - Diag(Var->getLocation(), diag::note_previous_decl) - << Var->getDeclName(); + Diag(ExprLoc, diag::err_lambda_impcap) << Var; + Diag(Var->getLocation(), diag::note_previous_decl) << Var; if (cast(CSI)->Lambda) Diag(cast(CSI)->Lambda->getBeginLoc(), diag::note_lambda_decl); @@ -18346,7 +18335,7 @@ bool Sema::CheckCallReturnType(QualType ReturnType, SourceLocation Loc, } S.Diag(Loc, diag::err_call_function_incomplete_return) - << CE->getSourceRange() << FD->getDeclName() << T; + << CE->getSourceRange() << FD << T; S.Diag(FD->getLocation(), diag::note_entity_declared_at) << FD->getDeclName(); } diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index ebfc1ec4b9749..466d1fe59c715 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -231,12 +231,10 @@ static void diagnoseInstanceReference(Sema &SemaRef, } /// Builds an expression which might be an implicit member expression. -ExprResult -Sema::BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - LookupResult &R, - const TemplateArgumentListInfo *TemplateArgs, - const Scope *S) { +ExprResult Sema::BuildPossibleImplicitMemberExpr( + const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, LookupResult &R, + const TemplateArgumentListInfo *TemplateArgs, const Scope *S, + UnresolvedLookupExpr *AsULE) { switch (ClassifyImplicitMemberAccess(*this, R)) { case IMA_Instance: return BuildImplicitMemberExpr(SS, TemplateKWLoc, R, TemplateArgs, true, S); @@ -257,7 +255,7 @@ Sema::BuildPossibleImplicitMemberExpr(const CXXScopeSpec &SS, case IMA_Unresolved_StaticContext: if (TemplateArgs || TemplateKWLoc.isValid()) return BuildTemplateIdExpr(SS, TemplateKWLoc, R, false, TemplateArgs); - return BuildDeclarationNameExpr(SS, R, false); + return AsULE ? AsULE : BuildDeclarationNameExpr(SS, R, false); case IMA_Error_StaticContext: case IMA_Error_Unrelated: diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 657ed13f207ad..c9f2854f7accf 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -803,7 +803,8 @@ QualType Sema::buildLambdaInitCaptureInitialization( Diag(EllipsisLoc, getLangOpts().CPlusPlus20 ? diag::warn_cxx17_compat_init_capture_pack : diag::ext_init_capture_pack); - DeductType = Context.getPackExpansionType(DeductType, NumExpansions); + DeductType = Context.getPackExpansionType(DeductType, NumExpansions, + /*ExpectPackInType=*/false); TLB.push(DeductType).setEllipsisLoc(EllipsisLoc); } else { // Just ignore the ellipsis for now and form a non-pack variable. We'll @@ -1623,8 +1624,9 @@ FieldDecl *Sema::BuildCaptureField(RecordDecl *RD, // Build the non-static data member. FieldDecl *Field = - FieldDecl::Create(Context, RD, Loc, Loc, nullptr, FieldType, TSI, nullptr, - false, ICIS_NoInit); + FieldDecl::Create(Context, RD, /*StartLoc=*/Loc, /*IdLoc=*/Loc, + /*Id=*/nullptr, FieldType, TSI, /*BW=*/nullptr, + /*Mutable=*/false, ICIS_NoInit); // If the variable being captured has an invalid type, mark the class as // invalid as well. 
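// Context for the getPackExpansionType change above (hypothetical example,
// not from the patch): for a C++20 init-capture pack the deduced type is
// still plain 'auto' when the expansion is formed, so the pattern contains
// no unexpanded parameter pack yet; ExpectPackInType=false suppresses the
// usual assertion for that case:
//
//   template <typename... Ts> auto f(Ts... ts) {
//     return [...copies = std::move(ts)] { return sizeof...(copies); };
//   }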
if (!FieldType->isDependentType()) { @@ -1784,7 +1786,7 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc, CUDACheckLambdaCapture(CallOperator, From); } - Class->setCaptures(Captures); + Class->setCaptures(Context, Captures); // C++11 [expr.prim.lambda]p6: // The closure type for a lambda-expression with no lambda-capture diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 51609e37e20ca..0367d5c8c6016 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2244,7 +2244,11 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, [](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); }, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context); + D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + !D->getType() + .getNonReferenceType() + .getCanonicalType() + ->getAsCXXRecordDecl(); OpenMPDirectiveKind DKind = DSAStack->getDirective(Level); SmallVector CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); @@ -5891,8 +5895,15 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, return Call; ASTContext &Context = getASTContext(); - OMPContext OMPCtx(getLangOpts().OpenMPIsDevice, - Context.getTargetInfo().getTriple()); + std::function DiagUnknownTrait = [this, + CE](StringRef ISATrait) { + // TODO Track the selector locations in a way that is accessible here to + // improve the diagnostic location. + Diag(CE->getBeginLoc(), diag::warn_unknown_declare_variant_isa_trait) + << ISATrait; + }; + TargetOMPContext OMPCtx(Context, std::move(DiagUnknownTrait), + getCurFunctionDecl()); SmallVector Exprs; SmallVector VMIs; @@ -5904,7 +5915,8 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, VariantMatchInfo VMI; OMPTraitInfo &TI = A->getTraitInfo(); TI.getAsVariantMatchInfo(Context, VMI); - if (!isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ false)) + if (!isVariantApplicableInContext(VMI, OMPCtx, + /* DeviceSetOnly */ false)) continue; VMIs.push_back(VMI); @@ -13532,7 +13544,9 @@ OMPClause *Sema::ActOnOpenMPVarListClause( DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, bool IsMapTypeImplicit, - SourceLocation ExtraModifierLoc) { + SourceLocation ExtraModifierLoc, + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc) { SourceLocation StartLoc = Locs.StartLoc; SourceLocation LParenLoc = Locs.LParenLoc; SourceLocation EndLoc = Locs.EndLoc; @@ -13609,12 +13623,14 @@ OMPClause *Sema::ActOnOpenMPVarListClause( IsMapTypeImplicit, ExtraModifierLoc, ColonLoc, VarList, Locs); break; case OMPC_to: - Res = ActOnOpenMPToClause(VarList, ReductionOrMapperIdScopeSpec, - ReductionOrMapperId, Locs); + Res = ActOnOpenMPToClause(MotionModifiers, MotionModifiersLoc, + ReductionOrMapperIdScopeSpec, ReductionOrMapperId, + ColonLoc, VarList, Locs); break; case OMPC_from: - Res = ActOnOpenMPFromClause(VarList, ReductionOrMapperIdScopeSpec, - ReductionOrMapperId, Locs); + Res = ActOnOpenMPFromClause(MotionModifiers, MotionModifiersLoc, + ReductionOrMapperIdScopeSpec, + ReductionOrMapperId, ColonLoc, VarList, Locs); break; case OMPC_use_device_ptr: Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs); @@ -18572,11 +18588,31 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D); } -OMPClause *Sema::ActOnOpenMPToClause(ArrayRef VarList, - 
CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers) { +OMPClause *Sema::ActOnOpenMPToClause( + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, + SourceLocation ColonLoc, ArrayRef VarList, + const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { + OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; + SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; + + // Process motion-modifiers, flag errors for duplicate modifiers. + unsigned Count = 0; + for (unsigned I = 0, E = MotionModifiers.size(); I < E; ++I) { + if (MotionModifiers[I] != OMPC_MOTION_MODIFIER_unknown && + llvm::find(Modifiers, MotionModifiers[I]) != std::end(Modifiers)) { + Diag(MotionModifiersLoc[I], diag::err_omp_duplicate_motion_modifier); + continue; + } + assert(Count < NumberOfOMPMotionModifiers && + "Modifiers exceed the allowed number of motion modifiers"); + Modifiers[Count] = MotionModifiers[I]; + ModifiersLoc[Count] = MotionModifiersLoc[I]; + ++Count; + } + MappableVarListInfo MVLI(VarList); checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); @@ -18585,15 +18621,35 @@ OMPClause *Sema::ActOnOpenMPToClause(ArrayRef VarList, return OMPToClause::Create( Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, + MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(Context), MapperId); } -OMPClause *Sema::ActOnOpenMPFromClause(ArrayRef VarList, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers) { +OMPClause *Sema::ActOnOpenMPFromClause( + ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, + SourceLocation ColonLoc, ArrayRef VarList, + const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { + OpenMPMotionModifierKind Modifiers[] = {OMPC_MOTION_MODIFIER_unknown, + OMPC_MOTION_MODIFIER_unknown}; + SourceLocation ModifiersLoc[NumberOfOMPMotionModifiers]; + + // Process motion-modifiers, flag errors for duplicate modifiers. 
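// Worked example for the loop below (hypothetical input, not from the
// patch): Modifiers[] starts as {unknown, unknown}; each recognized modifier
// fills the next slot, and repeating one is rejected:
//
//   #pragma omp target update from(present, present: x)
//   // error: err_omp_duplicate_motion_modifier on the second 'present'
//
// At most NumberOfOMPMotionModifiers distinct modifiers can apply.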
+ unsigned Count = 0; + for (unsigned I = 0, E = MotionModifiers.size(); I < E; ++I) { + if (MotionModifiers[I] != OMPC_MOTION_MODIFIER_unknown && + llvm::find(Modifiers, MotionModifiers[I]) != std::end(Modifiers)) { + Diag(MotionModifiersLoc[I], diag::err_omp_duplicate_motion_modifier); + continue; + } + assert(Count < NumberOfOMPMotionModifiers && + "Modifiers exceed the allowed number of motion modifiers"); + Modifiers[Count] = MotionModifiers[I]; + ModifiersLoc[Count] = MotionModifiersLoc[I]; + ++Count; + } + MappableVarListInfo MVLI(VarList); checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); @@ -18602,7 +18658,7 @@ OMPClause *Sema::ActOnOpenMPFromClause(ArrayRef VarList, return OMPFromClause::Create( Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, - MVLI.VarComponents, MVLI.UDMapperList, + MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(Context), MapperId); } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 5b4e7a2fdafa5..00563cff62cf6 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14193,12 +14193,12 @@ Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE, Diag(MemExpr->getBeginLoc(), diag::warn_call_to_pure_virtual_member_function_from_ctor_dtor) << MD->getDeclName() << isa(CurContext) - << MD->getParent()->getDeclName(); + << MD->getParent(); Diag(MD->getBeginLoc(), diag::note_previous_decl) << MD->getDeclName(); if (getLangOpts().AppleKext) Diag(MemExpr->getBeginLoc(), diag::note_pure_qualified_call_kext) - << MD->getParent()->getDeclName() << MD->getDeclName(); + << MD->getParent() << MD->getDeclName(); } } diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index ad5a3768846c7..69f9061f7c482 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -825,7 +825,7 @@ class KernelObjVisitor { else if (ElementTy->isArrayType()) VisitArrayElements(ArrayField, ElementTy, handlers...); else if (ElementTy->isScalarType()) - KF_FOR_EACH(handleScalarType, ArrayField, ElementTy); + KF_FOR_EACH(handleSimpleType, ArrayField, ElementTy); } template @@ -941,9 +941,10 @@ class KernelObjVisitor { else if (FieldTy->isArrayType()) { if (KF_FOR_EACH(handleArrayType, Field, FieldTy)) VisitArrayElements(Field, FieldTy, handlers...); - } else if (FieldTy->isScalarType() || FieldTy->isVectorType()) - KF_FOR_EACH(handleScalarType, Field, FieldTy); - else + } else if (FieldTy->isScalarType() || FieldTy->isUnionType() || + FieldTy->isVectorType()) { + KF_FOR_EACH(handleSimpleType, Field, FieldTy); + } else KF_FOR_EACH(handleOtherType, Field, FieldTy); (void)std::initializer_list{ (handlers.leaveField(Owner, Field), 0)...}; @@ -1000,7 +1001,7 @@ class SyclKernelFieldHandler { virtual bool handleReferenceType(FieldDecl *, QualType) { return true; } virtual bool handlePointerType(FieldDecl *, QualType) { return true; } virtual bool handleArrayType(FieldDecl *, QualType) { return true; } - virtual bool handleScalarType(FieldDecl *, QualType) { return true; } + virtual bool handleSimpleType(FieldDecl *, QualType) { return true; } // Most handlers shouldn't be handling this, just the field checker. 
virtual bool handleOtherType(FieldDecl *, QualType) { return true; } @@ -1286,7 +1287,7 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { return true; } - bool handleScalarType(FieldDecl *FD, QualType FieldTy) final { + bool handleSimpleType(FieldDecl *FD, QualType FieldTy) final { addParam(FD, FieldTy); return true; } @@ -1605,7 +1606,7 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler { return true; } - bool handleScalarType(FieldDecl *FD, QualType FieldTy) final { + bool handleSimpleType(FieldDecl *FD, QualType FieldTy) final { if (dyn_cast(MemberExprBases.back())) createExprForScalarElement(FD); else @@ -1812,7 +1813,7 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { return true; } - bool handleScalarType(FieldDecl *FD, QualType FieldTy) final { + bool handleSimpleType(FieldDecl *FD, QualType FieldTy) final { addParam(FD, FieldTy, SYCLIntegrationHeader::kind_std_layout); return true; } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 33184d4e2fcf6..4a12ba4acbbde 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3625,12 +3625,11 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { if (FD->hasAttrs()) Attrs = &FD->getAttrs(); if (FD->isNoReturn()) - Diag(ReturnLoc, diag::warn_noreturn_function_has_return_expr) - << FD->getDeclName(); + Diag(ReturnLoc, diag::warn_noreturn_function_has_return_expr) << FD; if (FD->isMain() && RetValExp) if (isa(RetValExp)) Diag(ReturnLoc, diag::warn_main_returns_bool_literal) - << RetValExp->getSourceRange(); + << RetValExp->getSourceRange(); if (FD->hasAttr() && RetValExp) { if (const auto *RT = dyn_cast(FnRetType.getCanonicalType())) { if (RT->getDecl()->isOrContainsUnion()) @@ -3701,8 +3700,7 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { FunctionKind = 3; Diag(ReturnLoc, diag::err_return_init_list) - << CurDecl->getDeclName() << FunctionKind - << RetValExp->getSourceRange(); + << CurDecl << FunctionKind << RetValExp->getSourceRange(); // Drop the expression. RetValExp = nullptr; @@ -3729,9 +3727,8 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { // return of void in constructor/destructor is illegal in C++. if (D == diag::err_ctor_dtor_returns_void) { NamedDecl *CurDecl = getCurFunctionOrMethodDecl(); - Diag(ReturnLoc, D) - << CurDecl->getDeclName() << isa(CurDecl) - << RetValExp->getSourceRange(); + Diag(ReturnLoc, D) << CurDecl << isa(CurDecl) + << RetValExp->getSourceRange(); } // return (some void expression); is legal in C++. else if (D != diag::ext_return_has_void_expr || @@ -3747,8 +3744,7 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp) { FunctionKind = 3; Diag(ReturnLoc, D) - << CurDecl->getDeclName() << FunctionKind - << RetValExp->getSourceRange(); + << CurDecl << FunctionKind << RetValExp->getSourceRange(); } } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index c05ed0b14e3eb..3991f2b479778 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -7771,22 +7771,28 @@ Sema::CheckTemplateDeclScope(Scope *S, TemplateParameterList *TemplateParams) { (S->getFlags() & Scope::TemplateParamScope) != 0) S = S->getParent(); - // C++ [temp]p4: - // A template [...] shall not have C linkage. + // C++ [temp.pre]p6: [P2096] + // A template, explicit specialization, or partial specialization shall not + // have C linkage. 
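// Illustration (not part of this patch): the scope check below rejects a
// template declared with C language linkage, per the wording quoted above.
// A minimal reproducer:
//
//   extern "C" {
//     template <typename T> void f(T); // error: templates must have
//                                      // C++ linkage
//   }
//
// With P2096, an explicit or partial specialization appearing inside the
// extern "C" block is rejected the same way.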
DeclContext *Ctx = S->getEntity(); - assert(Ctx && "Unknown context"); - if (Ctx->isExternCContext()) { + if (Ctx && Ctx->isExternCContext()) { Diag(TemplateParams->getTemplateLoc(), diag::err_template_linkage) << TemplateParams->getSourceRange(); if (const LinkageSpecDecl *LSD = Ctx->getExternCContext()) Diag(LSD->getExternLoc(), diag::note_extern_c_begins_here); return true; } - Ctx = Ctx->getRedeclContext(); + Ctx = Ctx ? Ctx->getRedeclContext() : nullptr; // C++ [temp]p2: // A template-declaration can appear only as a namespace scope or // class scope declaration. + // C++ [temp.expl.spec]p3: + // An explicit specialization may be declared in any scope in which the + // corresponding primary template may be defined. + // C++ [temp.class.spec]p6: [P2096] + // A partial specialization may be declared in any scope in which the + // corresponding primary template may be defined. if (Ctx) { if (Ctx->isFileContext()) return false; @@ -8106,6 +8112,10 @@ DeclResult Sema::ActOnClassTemplateSpecialization( if (Invalid) return true; + // Check that we can declare a template specialization here. + if (TemplateParams && CheckTemplateDeclScope(S, TemplateParams)) + return true; + if (TemplateParams && TemplateParams->size() > 0) { isPartialSpecialization = true; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 52062e9a5039c..7aa94502fa846 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -1201,6 +1201,120 @@ static bool isForwardingReference(QualType Param, unsigned FirstInnerIndex) { return false; } +/// Attempt to deduce the template arguments by checking the base types +/// according to C++20 [temp.deduct.call] p4b3. +/// +/// \param S the semantic analysis object within which we are deducing. +/// +/// \param RecordT the top level record object we are deducing against. +/// +/// \param TemplateParams the template parameters that we are deducing. +/// +/// \param SpecParam the template specialization parameter type. +/// +/// \param Info information about the template argument deduction itself. +/// +/// \param Deduced the deduced template arguments. +/// +/// \returns the result of template argument deduction with the bases. "invalid" +/// means no matches, "success" found a single item, and the +/// "MiscellaneousDeductionFailure" result happens when the match is ambiguous. +static Sema::TemplateDeductionResult DeduceTemplateBases( + Sema &S, const RecordType *RecordT, TemplateParameterList *TemplateParams, + const TemplateSpecializationType *SpecParam, TemplateDeductionInfo &Info, + SmallVectorImpl &Deduced) { + // C++14 [temp.deduct.call] p4b3: + // If P is a class and P has the form simple-template-id, then the + // transformed A can be a derived class of the deduced A. Likewise if + // P is a pointer to a class of the form simple-template-id, the + // transformed A can be a pointer to a derived class pointed to by the + // deduced A. However, if there is a class C that is a (direct or + // indirect) base class of D and derived (directly or indirectly) from a + // class B and that would be a valid deduced A, the deduced A cannot be + // B or pointer to B, respectively. + // + // These alternatives are considered only if type deduction would + // otherwise fail. If they yield more than one possible deduced A, the + // type deduction fails. + + // Use a depth-first search through the bases to collect the set of + // successful matches.
Visited contains the set of nodes we have already + // visited, while ToVisit is our stack of records that we still need to + // visit. Matches contains a list of matches that have yet to be + // disqualified. + llvm::SmallPtrSet Visited; + SmallVector ToVisit; + // We iterate over this later, so we have to use MapVector to ensure + // determinism. + llvm::MapVector> + Matches; + + auto AddBases = [&Visited, &ToVisit](const RecordType *RT) { + CXXRecordDecl *RD = cast(RT->getDecl()); + for (const auto &Base : RD->bases()) { + assert(Base.getType()->isRecordType() && + "Base class that isn't a record?"); + const RecordType *RT = Base.getType()->getAs(); + if (Visited.insert(RT).second) + ToVisit.push_back(Base.getType()->getAs()); + } + }; + + // Set up the loop by adding all the bases. + AddBases(RecordT); + + // Search each path of bases until we either run into a successful match + // (in which case its own bases are not searched), or we run out of bases. + while (!ToVisit.empty()) { + const RecordType *NextT = ToVisit.pop_back_val(); + + SmallVector DeducedCopy(Deduced.begin(), + Deduced.end()); + TemplateDeductionInfo BaseInfo(TemplateDeductionInfo::ForBase, Info); + Sema::TemplateDeductionResult BaseResult = + DeduceTemplateArguments(S, TemplateParams, SpecParam, + QualType(NextT, 0), BaseInfo, DeducedCopy); + + // If this was a successful deduction, add it to the list of matches, + // otherwise we need to continue searching its bases. + if (BaseResult == Sema::TDK_Success) + Matches.insert({NextT, DeducedCopy}); + else + AddBases(NextT); + } + + // At this point, 'Matches' contains a list of seemingly valid bases; however, + // in the event that we have more than one match, it is possible that the base + // of one of the matches might be disqualified for being a base of another + // valid match. We can count on cyclical instantiations being invalid to + // simplify the disqualifications. That is, if A & B are both matches, and B + // inherits from A (disqualifying A), we know that A cannot inherit from B. + if (Matches.size() > 1) { + Visited.clear(); + for (const auto &Match : Matches) + AddBases(Match.first); + + // We can give up once we have a single item (or have run out of things to + // search) since cyclical inheritance isn't valid. + while (Matches.size() > 1 && !ToVisit.empty()) { + const RecordType *NextT = ToVisit.pop_back_val(); + Matches.erase(NextT); + + // Always add all bases, since the inheritance tree can contain + // disqualifications for multiple matches. + AddBases(NextT); + } + } + + if (Matches.empty()) + return Sema::TDK_Invalid; + if (Matches.size() > 1) + return Sema::TDK_MiscellaneousDeductionFailure; + + std::swap(Matches.front().second, Deduced); + return Sema::TDK_Success; +} + /// Deduce the template arguments by comparing the parameter type and /// the argument type (C++ [temp.deduct.type]). /// @@ -1787,78 +1901,15 @@ DeduceTemplateArgumentsByTypeMatch(Sema &S, if (!S.isCompleteType(Info.getLocation(), Arg)) return Result; - // C++14 [temp.deduct.call] p4b3: - // If P is a class and P has the form simple-template-id, then the - // transformed A can be a derived class of the deduced A. Likewise if - // P is a pointer to a class of the form simple-template-id, the - // transformed A can be a pointer to a derived class pointed to by the - // deduced A. - // - // These alternatives are considered only if type deduction would - // otherwise fail. If they yield more than one possible deduced A, the - // type deduction fails.
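// Illustration (not part of this patch) of the matching and pruning that
// DeduceTemplateBases performs; the names are hypothetical:
//
//   template <typename... Ts> struct A {};
//   template <typename T, typename... Ts>
//   struct A<T, Ts...> : A<Ts...> {};    // A<int, int> : A<int> : A<>
//
//   struct B : A<int, int> {};
//
//   template <typename... Ts> void f(const A<Ts...> &);
//   void g() { f(B{}); } // Ts deduced as <int, int>: the search stops at
//                        // the nearest matching base on each path, and the
//                        // pruning loop drops any remaining match that is
//                        // itself a base of another match.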
- // Reset the incorrectly deduced argument from above. Deduced = DeducedOrig; - // Use data recursion to crawl through the list of base classes. - // Visited contains the set of nodes we have already visited, while - // ToVisit is our stack of records that we still need to visit. - llvm::SmallPtrSet Visited; - SmallVector ToVisit; - ToVisit.push_back(RecordT); - bool Successful = false; - SmallVector SuccessfulDeduced; - while (!ToVisit.empty()) { - // Retrieve the next class in the inheritance hierarchy. - const RecordType *NextT = ToVisit.pop_back_val(); - - // If we have already seen this type, skip it. - if (!Visited.insert(NextT).second) - continue; - - // If this is a base class, try to perform template argument - // deduction from it. - if (NextT != RecordT) { - TemplateDeductionInfo BaseInfo(TemplateDeductionInfo::ForBase, Info); - Sema::TemplateDeductionResult BaseResult = - DeduceTemplateArguments(S, TemplateParams, SpecParam, - QualType(NextT, 0), BaseInfo, Deduced); - - // If template argument deduction for this base was successful, - // note that we had some success. Otherwise, ignore any deductions - // from this base class. - if (BaseResult == Sema::TDK_Success) { - // If we've already seen some success, then deduction fails due to - // an ambiguity (temp.deduct.call p5). - if (Successful) - return Sema::TDK_MiscellaneousDeductionFailure; - - Successful = true; - std::swap(SuccessfulDeduced, Deduced); - - Info.Param = BaseInfo.Param; - Info.FirstArg = BaseInfo.FirstArg; - Info.SecondArg = BaseInfo.SecondArg; - } - - Deduced = DeducedOrig; - } - - // Visit base classes - CXXRecordDecl *Next = cast(NextT->getDecl()); - for (const auto &Base : Next->bases()) { - assert(Base.getType()->isRecordType() && - "Base class that isn't a record?"); - ToVisit.push_back(Base.getType()->getAs()); - } - } - - if (Successful) { - std::swap(SuccessfulDeduced, Deduced); - return Sema::TDK_Success; - } + // Check bases according to C++14 [temp.deduct.call] p4b3: + Sema::TemplateDeductionResult BaseResult = DeduceTemplateBases( + S, RecordT, TemplateParams, SpecParam, Info, Deduced); + if (BaseResult != Sema::TDK_Invalid) + return BaseResult; return Result; } @@ -3815,8 +3866,11 @@ static bool AdjustFunctionParmAndArgTypesForDeduction( // If P is a forwarding reference and the argument is an lvalue, the type // "lvalue reference to A" is used in place of A for type deduction. 
if (isForwardingReference(QualType(ParamRefType, 0), FirstInnerIndex) && - Arg->isLValue()) + Arg->isLValue()) { + if (S.getLangOpts().OpenCL) + ArgType = S.Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic); ArgType = S.Context.getLValueReferenceType(ArgType); + } } else { // C++ [temp.deduct.call]p2: // If P is not a reference type: @@ -4893,6 +4947,13 @@ QualType Sema::ReplaceAutoType(QualType TypeWithAuto, .TransformType(TypeWithAuto); } +TypeSourceInfo *Sema::ReplaceAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto, + QualType TypeToReplaceAuto) { + return SubstituteDeducedTypeTransform(*this, TypeToReplaceAuto, + /*UseTypeSugar*/ false) + .TransformType(TypeWithAuto); +} + void Sema::DiagnoseAutoDeductionFailure(VarDecl *VDecl, Expr *Init) { if (isa(Init)) Diag(VDecl->getLocation(), diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 7b77d1cb482ae..259cc51657763 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -614,7 +614,8 @@ QualType Sema::CheckPackExpansion(QualType Pattern, SourceRange PatternRange, return QualType(); } - return Context.getPackExpansionType(Pattern, NumExpansions); + return Context.getPackExpansionType(Pattern, NumExpansions, + /*ExpectPackInType=*/false); } ExprResult Sema::ActOnPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc) { diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 9ce5ec66dd814..14245f0392d6c 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -3110,24 +3110,10 @@ static void diagnoseRedundantReturnTypeQualifiers(Sema &S, QualType RetTy, D.getDeclSpec().getUnalignedSpecLoc()); } -static void CopyTypeConstraintFromAutoType(Sema &SemaRef, const AutoType *Auto, - AutoTypeLoc AutoLoc, - TemplateTypeParmDecl *TP, - SourceLocation EllipsisLoc) { - - TemplateArgumentListInfo TAL(AutoLoc.getLAngleLoc(), AutoLoc.getRAngleLoc()); - for (unsigned Idx = 0; Idx < AutoLoc.getNumArgs(); ++Idx) - TAL.addArgument(AutoLoc.getArgLoc(Idx)); - - SemaRef.AttachTypeConstraint( - AutoLoc.getNestedNameSpecifierLoc(), AutoLoc.getConceptNameInfo(), - AutoLoc.getNamedConcept(), - AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr, TP, EllipsisLoc); -} - -static QualType InventTemplateParameter( - TypeProcessingState &state, QualType T, TypeSourceInfo *TSI, AutoType *Auto, - InventedTemplateParameterInfo &Info) { +static std::pair +InventTemplateParameter(TypeProcessingState &state, QualType T, + TypeSourceInfo *TrailingTSI, AutoType *Auto, + InventedTemplateParameterInfo &Info) { Sema &S = state.getSema(); Declarator &D = state.getDeclarator(); @@ -3152,13 +3138,25 @@ static QualType InventTemplateParameter( IsParameterPack, /*HasTypeConstraint=*/Auto->isConstrained()); InventedTemplateParam->setImplicit(); Info.TemplateParams.push_back(InventedTemplateParam); - // Attach type constraints + + // Attach type constraints to the new parameter. if (Auto->isConstrained()) { - if (TSI) { - CopyTypeConstraintFromAutoType( - S, Auto, TSI->getTypeLoc().getContainedAutoTypeLoc(), - InventedTemplateParam, D.getEllipsisLoc()); + if (TrailingTSI) { + // The 'auto' appears in a trailing return type we've already built; + // extract its type constraints to attach to the template parameter. 
+ AutoTypeLoc AutoLoc = TrailingTSI->getTypeLoc().getContainedAutoTypeLoc(); + TemplateArgumentListInfo TAL(AutoLoc.getLAngleLoc(), AutoLoc.getRAngleLoc()); + for (unsigned Idx = 0; Idx < AutoLoc.getNumArgs(); ++Idx) + TAL.addArgument(AutoLoc.getArgLoc(Idx)); + + S.AttachTypeConstraint(AutoLoc.getNestedNameSpecifierLoc(), + AutoLoc.getConceptNameInfo(), + AutoLoc.getNamedConcept(), + AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr, + InventedTemplateParam, D.getEllipsisLoc()); } else { + // The 'auto' appears in the decl-specifiers; we've not finished forming + // TypeSourceInfo for it yet. TemplateIdAnnotation *TemplateId = D.getDeclSpec().getRepAsTemplateId(); TemplateArgumentListInfo TemplateArgsInfo; if (TemplateId->LAngleLoc.isValid()) { @@ -3176,15 +3174,16 @@ static QualType InventTemplateParameter( } } - // If TSI is nullptr, this is a constrained declspec auto and the type - // constraint will be attached later in TypeSpecLocFiller - // Replace the 'auto' in the function parameter with this invented // template type parameter. // FIXME: Retain some type sugar to indicate that this was written // as 'auto'? - return state.ReplaceAutoType( - T, QualType(InventedTemplateParam->getTypeForDecl(), 0)); + QualType Replacement(InventedTemplateParam->getTypeForDecl(), 0); + QualType NewT = state.ReplaceAutoType(T, Replacement); + TypeSourceInfo *NewTSI = + TrailingTSI ? S.ReplaceAutoTypeSourceInfo(TrailingTSI, Replacement) + : nullptr; + return {NewT, NewTSI}; } static TypeSourceInfo * @@ -3243,8 +3242,19 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, if (!D.getAttributes().empty()) distributeTypeAttrsFromDeclarator(state, T); + // Find the deduced type in this type. Look in the trailing return type if we + // have one, otherwise in the DeclSpec type. + // FIXME: The standard wording doesn't currently describe this. + DeducedType *Deduced = T->getContainedDeducedType(); + bool DeducedIsTrailingReturnType = false; + if (Deduced && isa(Deduced) && D.hasTrailingReturnType()) { + QualType T = SemaRef.GetTypeFromParser(D.getTrailingReturnType()); + Deduced = T.isNull() ? nullptr : T->getContainedDeducedType(); + DeducedIsTrailingReturnType = true; + } + // C++11 [dcl.spec.auto]p5: reject 'auto' if it is not in an allowed context. - if (DeducedType *Deduced = T->getContainedDeducedType()) { + if (Deduced) { AutoType *Auto = dyn_cast(Deduced); int Error = -1; @@ -3278,10 +3288,6 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, } else if (!SemaRef.getCurScope()->isFunctionDeclarationScope()) { Error = 21; break; - } else if (D.hasTrailingReturnType()) { - // This might be OK, but we'll need to convert the trailing return - // type later. - break; } Info = &SemaRef.InventedParameterInfos.back(); @@ -3295,7 +3301,12 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, Info = SemaRef.getCurLambda(); assert(Info && "No LambdaScopeInfo on the stack!"); } - T = InventTemplateParameter(state, T, nullptr, Auto, *Info); + + // We'll deal with inventing template parameters for 'auto' in trailing + // return types when we pick up the trailing return type when processing + // the function chunk. 
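// Illustration (not part of this patch) of the trailing-return-type cases
// this code is aimed at; 'C' stands for any concept:
//
//   auto f() -> auto;              // deduced return type spelled trailing
//   auto g() -> C auto;            // C++20 constrained deduced return type
//   void h(auto (*p)() -> auto);   // 'auto' inside a parameter's trailing
//                                  // return type invents a template
//                                  // parameter for the enclosing function
//   auto l = [](auto (*p)() -> auto) {}; // likewise for generic lambdas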
+ if (!DeducedIsTrailingReturnType) + T = InventTemplateParameter(state, T, nullptr, Auto, *Info).first; break; } case DeclaratorContext::MemberContext: { @@ -3393,20 +3404,6 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, (!SemaRef.getLangOpts().CPlusPlus11 || !IsCXXAutoType)) Error = 13; - bool HaveTrailing = false; - - // C++11 [dcl.spec.auto]p2: 'auto' is always fine if the declarator - // contains a trailing return type. That is only legal at the outermost - // level. Check all declarator chunks (outermost first) anyway, to give - // better diagnostics. - // We don't support '__auto_type' with trailing return types. - // FIXME: Should we only do this for 'auto' and not 'decltype(auto)'? - if (SemaRef.getLangOpts().CPlusPlus11 && IsCXXAutoType && - D.hasTrailingReturnType()) { - HaveTrailing = true; - Error = -1; - } - SourceRange AutoRange = D.getDeclSpec().getTypeSpecTypeLoc(); if (D.getName().getKind() == UnqualifiedIdKind::IK_ConversionFunctionId) AutoRange = D.getName().getSourceRange(); @@ -3436,8 +3433,7 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, T = SemaRef.Context.IntTy; D.setInvalidType(true); - } else if (Auto && !HaveTrailing && - D.getContext() != DeclaratorContext::LambdaExprContext) { + } else if (Auto && D.getContext() != DeclaratorContext::LambdaExprContext) { // If there was a trailing return type, we already got // warn_cxx98_compat_trailing_return_type in the parser. SemaRef.Diag(AutoRange.getBegin(), @@ -4890,12 +4886,21 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, // An error occurred parsing the trailing return type. T = Context.IntTy; D.setInvalidType(true); - } else if (S.getLangOpts().CPlusPlus20) - // Handle cases like: `auto f() -> auto` or `auto f() -> C auto`. - if (AutoType *Auto = T->getContainedAutoType()) - if (S.getCurScope()->isFunctionDeclarationScope()) - T = InventTemplateParameter(state, T, TInfo, Auto, - S.InventedParameterInfos.back()); + } else if (AutoType *Auto = T->getContainedAutoType()) { + // If the trailing return type contains an `auto`, we may need to + // invent a template parameter for it, for cases like + // `auto f() -> C auto` or `[](auto (*p) -> auto) {}`. + InventedTemplateParameterInfo *InventedParamInfo = nullptr; + if (D.getContext() == DeclaratorContext::PrototypeContext) + InventedParamInfo = &S.InventedParameterInfos.back(); + else if (D.getContext() == + DeclaratorContext::LambdaExprParameterContext) + InventedParamInfo = S.getCurLambda(); + if (InventedParamInfo) { + std::tie(T, TInfo) = InventTemplateParameter( + state, T, TInfo, Auto, *InventedParamInfo); + } + } } else { // This function type is not the type of the entity being declared, // so checking the 'auto' is not the responsibility of this chunk. @@ -5131,7 +5136,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, // is an incomplete type (C99 6.2.5p19) and function decls cannot // have parameters of incomplete type. 
if (FTI.NumParams != 1 || FTI.isVariadic) { - S.Diag(DeclType.Loc, diag::err_void_only_param); + S.Diag(FTI.Params[i].IdentLoc, diag::err_void_only_param); ParamTy = Context.IntTy; Param->setType(ParamTy); } else if (FTI.Params[i].Ident) { @@ -5538,7 +5543,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, << T << D.getSourceRange(); D.setEllipsisLoc(SourceLocation()); } else { - T = Context.getPackExpansionType(T, None); + T = Context.getPackExpansionType(T, None, /*ExpectPackInType=*/false); } break; case DeclaratorContext::TemplateParamContext: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 641ae24de2c7f..a57e2359feffc 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2010,26 +2010,32 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new statement. /// Subclasses may override this routine to provide different behavior. - OMPClause *RebuildOMPToClause(ArrayRef VarList, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPToClause(VarList, MapperIdScopeSpec, MapperId, - Locs, UnresolvedMappers); + OMPClause * + RebuildOMPToClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers) { + return getSema().ActOnOpenMPToClause(MotionModifiers, MotionModifiersLoc, + MapperIdScopeSpec, MapperId, ColonLoc, + VarList, Locs, UnresolvedMappers); } /// Build a new OpenMP 'from' clause. /// /// By default, performs semantic analysis to build the new statement. /// Subclasses may override this routine to provide different behavior. - OMPClause *RebuildOMPFromClause(ArrayRef VarList, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPFromClause(VarList, MapperIdScopeSpec, MapperId, - Locs, UnresolvedMappers); + OMPClause * + RebuildOMPFromClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers) { + return getSema().ActOnOpenMPFromClause( + MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, + ColonLoc, VarList, Locs, UnresolvedMappers); } /// Build a new OpenMP 'use_device_ptr' clause. 
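// Source-level illustration of the motion modifiers threaded through the
// rebuild calls above (assuming the OpenMP 5.1 'present' modifier; these
// pragmas are not part of this patch):
//
//   #pragma omp target update to(present: a) from(present: b)
//
// A repeated modifier in the same list, e.g. to(present, present: a), is
// what the err_omp_duplicate_motion_modifier path in SemaOpenMP.cpp rejects.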
@@ -9745,8 +9751,9 @@ OMPClause *TreeTransform::TransformOMPToClause(OMPToClause *C) { if (transformOMPMappableExprListClause( *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; - return getDerived().RebuildOMPToClause(Vars, MapperIdScopeSpec, MapperIdInfo, - Locs, UnresolvedMappers); + return getDerived().RebuildOMPToClause( + C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, + MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); } template @@ -9760,7 +9767,8 @@ OMPClause *TreeTransform::TransformOMPFromClause(OMPFromClause *C) { *this, C, Vars, MapperIdScopeSpec, MapperIdInfo, UnresolvedMappers)) return nullptr; return getDerived().RebuildOMPFromClause( - Vars, MapperIdScopeSpec, MapperIdInfo, Locs, UnresolvedMappers); + C->getMotionModifiers(), C->getMotionModifiersLoc(), MapperIdScopeSpec, + MapperIdInfo, C->getColonLoc(), Vars, Locs, UnresolvedMappers); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 9f8d724e237ec..3931c41a2d17b 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12608,8 +12608,14 @@ void OMPClauseReader::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) { void OMPClauseReader::VisitOMPToClause(OMPToClause *C) { C->setLParenLoc(Record.readSourceLocation()); + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + C->setMotionModifier( + I, static_cast(Record.readInt())); + C->setMotionModifierLoc(I, Record.readSourceLocation()); + } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); + C->setColonLoc(Record.readSourceLocation()); auto NumVars = C->varlist_size(); auto UniqueDecls = C->getUniqueDeclarationsNum(); auto TotalLists = C->getTotalComponentListNum(); @@ -12658,8 +12664,14 @@ void OMPClauseReader::VisitOMPToClause(OMPToClause *C) { void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) { C->setLParenLoc(Record.readSourceLocation()); + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + C->setMotionModifier( + I, static_cast(Record.readInt())); + C->setMotionModifierLoc(I, Record.readSourceLocation()); + } C->setMapperQualifierLoc(Record.readNestedNameSpecifierLoc()); C->setMapperIdInfo(Record.readDeclarationNameInfo()); + C->setColonLoc(Record.readSourceLocation()); auto NumVars = C->varlist_size(); auto UniqueDecls = C->getUniqueDeclarationsNum(); auto TotalLists = C->getTotalComponentListNum(); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 0b87161ddeea3..5413b28ffde25 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2909,9 +2909,11 @@ static bool isSameTemplateParameter(const NamedDecl *X, return false; if (TX->hasTypeConstraint() != TY->hasTypeConstraint()) return false; - if (TX->hasTypeConstraint()) { - const TypeConstraint *TXTC = TX->getTypeConstraint(); - const TypeConstraint *TYTC = TY->getTypeConstraint(); + const TypeConstraint *TXTC = TX->getTypeConstraint(); + const TypeConstraint *TYTC = TY->getTypeConstraint(); + if (!TXTC != !TYTC) + return false; + if (TXTC && TYTC) { if (TXTC->getNamedConcept() != TYTC->getNamedConcept()) return false; if (TXTC->hasExplicitTemplateArgs() != TYTC->hasExplicitTemplateArgs()) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 9ea90b2a0212a..8c693b7d34fdd 100644 --- 
a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6581,8 +6581,13 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) { Record.push_back(C->getTotalComponentListNum()); Record.push_back(C->getTotalComponentsNum()); Record.AddSourceLocation(C->getLParenLoc()); + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + Record.push_back(C->getMotionModifier(I)); + Record.AddSourceLocation(C->getMotionModifierLoc(I)); + } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); + Record.AddSourceLocation(C->getColonLoc()); for (auto *E : C->varlists()) Record.AddStmt(E); for (auto *E : C->mapperlists()) @@ -6605,8 +6610,13 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) { Record.push_back(C->getTotalComponentListNum()); Record.push_back(C->getTotalComponentsNum()); Record.AddSourceLocation(C->getLParenLoc()); + for (unsigned I = 0; I < NumberOfOMPMotionModifiers; ++I) { + Record.push_back(C->getMotionModifier(I)); + Record.AddSourceLocation(C->getMotionModifierLoc(I)); + } Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc()); Record.AddDeclarationNameInfo(C->getMapperIdInfo()); + Record.AddSourceLocation(C->getColonLoc()); for (auto *E : C->varlists()) Record.AddStmt(E); for (auto *E : C->mapperlists()) diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp index 13836f08a61ef..78b3c209ad6bc 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp @@ -406,7 +406,7 @@ ProgramStateRef ObjCDeallocChecker::evalAssume(ProgramStateRef State, SVal Cond, if (State->get().isEmpty()) return State; - auto *CondBSE = dyn_cast_or_null(Cond.getAsSymExpr()); + auto *CondBSE = dyn_cast_or_null(Cond.getAsSymbol()); if (!CondBSE) return State; diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp index 2411f0e2d058f..9a87729de8fda 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp @@ -30,8 +30,9 @@ class DereferenceChecker : public Checker< check::Location, check::Bind, EventDispatcher > { - mutable std::unique_ptr BT_null; - mutable std::unique_ptr BT_undef; + BugType BT_Null{this, "Dereference of null pointer", categories::LogicError}; + BugType BT_Undef{this, "Dereference of undefined pointer value", + categories::LogicError}; void reportBug(ProgramStateRef State, const Stmt *S, CheckerContext &C) const; @@ -123,11 +124,6 @@ void DereferenceChecker::reportBug(ProgramStateRef State, const Stmt *S, if (!N) return; - // We know that 'location' cannot be non-null. This is what - // we call an "explicit" null dereference. - if (!BT_null) - BT_null.reset(new BuiltinBug(this, "Dereference of null pointer")); - SmallString<100> buf; llvm::raw_svector_ostream os(buf); @@ -180,7 +176,7 @@ void DereferenceChecker::reportBug(ProgramStateRef State, const Stmt *S, } auto report = std::make_unique( - *BT_null, buf.empty() ? BT_null->getDescription() : StringRef(buf), N); + BT_Null, buf.empty() ? BT_Null.getDescription() : StringRef(buf), N); bugreporter::trackExpressionValue(N, bugreporter::getDerefExpr(S), *report); @@ -196,12 +192,8 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S, // Check for dereference of an undefined value. 
if (l.isUndef()) { if (ExplodedNode *N = C.generateErrorNode()) { - if (!BT_undef) - BT_undef.reset( - new BuiltinBug(this, "Dereference of undefined pointer value")); - auto report = std::make_unique( - *BT_undef, BT_undef->getDescription(), N); + BT_Undef, BT_Undef.getDescription(), N); bugreporter::trackExpressionValue(N, bugreporter::getDerefExpr(S), *report); C.emitReport(std::move(report)); } @@ -219,9 +211,10 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S, ProgramStateRef notNullState, nullState; std::tie(notNullState, nullState) = state->assume(location); - // The explicit NULL case. if (nullState) { if (!notNullState) { + // We know that 'location' can only be null. This is what + // we call an "explicit" null dereference. const Expr *expr = getDereferenceExpr(S); if (!suppressReport(expr)) { reportBug(nullState, expr, C); diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp index 632de9e5dc832..ab5e6a1c9991f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp @@ -109,7 +109,7 @@ class IteratorModeling bool Postfix) const; void handleRandomIncrOrDecr(CheckerContext &C, const Expr *CE, OverloadedOperatorKind Op, const SVal &RetVal, - const SVal &LHS, const SVal &RHS) const; + const SVal &Iterator, const SVal &Amount) const; void handlePtrIncrOrDecr(CheckerContext &C, const Expr *Iterator, OverloadedOperatorKind OK, SVal Offset) const; void handleAdvance(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter, @@ -262,20 +262,30 @@ void IteratorModeling::checkPostStmt(const UnaryOperator *UO, void IteratorModeling::checkPostStmt(const BinaryOperator *BO, CheckerContext &C) const { - ProgramStateRef State = C.getState(); - BinaryOperatorKind OK = BO->getOpcode(); - SVal RVal = State->getSVal(BO->getRHS(), C.getLocationContext()); + const ProgramStateRef State = C.getState(); + const BinaryOperatorKind OK = BO->getOpcode(); + const Expr *const LHS = BO->getLHS(); + const Expr *const RHS = BO->getRHS(); + const SVal LVal = State->getSVal(LHS, C.getLocationContext()); + const SVal RVal = State->getSVal(RHS, C.getLocationContext()); if (isSimpleComparisonOperator(BO->getOpcode())) { - SVal LVal = State->getSVal(BO->getLHS(), C.getLocationContext()); SVal Result = State->getSVal(BO, C.getLocationContext()); handleComparison(C, BO, Result, LVal, RVal, BinaryOperator::getOverloadedOperator(OK)); } else if (isRandomIncrOrDecrOperator(OK)) { - if (!BO->getRHS()->getType()->isIntegralOrEnumerationType()) + // In case of operator+ the iterator can be either on the LHS (eg.: it + 1), + // or on the RHS (eg.: 1 + it). Both cases are modeled. + const bool IsIterOnLHS = BO->getLHS()->getType()->isPointerType(); + const Expr *const &IterExpr = IsIterOnLHS ? LHS : RHS; + const Expr *const &AmountExpr = IsIterOnLHS ? RHS : LHS; + + // The non-iterator side must have an integral or enumeration type. + if (!AmountExpr->getType()->isIntegralOrEnumerationType()) return; - handlePtrIncrOrDecr(C, BO->getLHS(), - BinaryOperator::getOverloadedOperator(OK), RVal); + const SVal &AmountVal = IsIterOnLHS ? 
RVal : LVal; + handlePtrIncrOrDecr(C, IterExpr, BinaryOperator::getOverloadedOperator(OK), + AmountVal); } } @@ -368,11 +378,24 @@ IteratorModeling::handleOverloadedOperator(CheckerContext &C, InstCall->getCXXThisVal(), Call.getArgSVal(0)); return; } - } else { - if (Call.getNumArgs() >= 2 && - Call.getArgExpr(1)->getType()->isIntegralOrEnumerationType()) { + } else if (Call.getNumArgs() >= 2) { + const Expr *FirstArg = Call.getArgExpr(0); + const Expr *SecondArg = Call.getArgExpr(1); + const QualType FirstType = FirstArg->getType(); + const QualType SecondType = SecondArg->getType(); + + if (FirstType->isIntegralOrEnumerationType() || + SecondType->isIntegralOrEnumerationType()) { + // In case of operator+ the iterator can be either on the LHS (eg.: + // it + 1), or on the RHS (eg.: 1 + it). Both cases are modeled. + const bool IsIterFirst = FirstType->isStructureOrClassType(); + const SVal FirstArg = Call.getArgSVal(0); + const SVal SecondArg = Call.getArgSVal(1); + const SVal &Iterator = IsIterFirst ? FirstArg : SecondArg; + const SVal &Amount = IsIterFirst ? SecondArg : FirstArg; + handleRandomIncrOrDecr(C, OrigExpr, Op, Call.getReturnValue(), - Call.getArgSVal(0), Call.getArgSVal(1)); + Iterator, Amount); return; } } @@ -564,35 +587,35 @@ void IteratorModeling::handleDecrement(CheckerContext &C, const SVal &RetVal, C.addTransition(State); } -void IteratorModeling::handleRandomIncrOrDecr(CheckerContext &C, - const Expr *CE, +void IteratorModeling::handleRandomIncrOrDecr(CheckerContext &C, const Expr *CE, OverloadedOperatorKind Op, const SVal &RetVal, - const SVal &LHS, - const SVal &RHS) const { + const SVal &Iterator, + const SVal &Amount) const { // Increment or decrement the symbolic expressions which represents the // position of the iterator auto State = C.getState(); - const auto *Pos = getIteratorPosition(State, LHS); + const auto *Pos = getIteratorPosition(State, Iterator); if (!Pos) return; - const auto *value = &RHS; - SVal val; - if (auto loc = RHS.getAs()) { - val = State->getRawSVal(*loc); - value = &val; + const auto *Value = &Amount; + SVal Val; + if (auto LocAmount = Amount.getAs()) { + Val = State->getRawSVal(*LocAmount); + Value = &Val; } - auto &TgtVal = (Op == OO_PlusEqual || Op == OO_MinusEqual) ? LHS : RetVal; + const auto &TgtVal = + (Op == OO_PlusEqual || Op == OO_MinusEqual) ? Iterator : RetVal; // `AdvancedState` is a state where the position of `LHS` is advanced. We // only need this state to retrieve the new position, but we do not want // to change the position of `LHS` (in every case). 
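// Illustration (not part of this patch) of the operand orders the modeling
// above now distinguishes:
//
//   std::vector<int> v;
//   auto It = v.begin();
//   auto A = It + 1; // iterator on the LHS, integral amount on the RHS
//   auto B = 1 + It; // iterator on the RHS, integral amount on the LHS
//   It += 2;         // compound form: the iterator itself is advanced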
- auto AdvancedState = advancePosition(State, LHS, Op, *value); + auto AdvancedState = advancePosition(State, Iterator, Op, *Value); if (AdvancedState) { - const auto *NewPos = getIteratorPosition(AdvancedState, LHS); + const auto *NewPos = getIteratorPosition(AdvancedState, Iterator); assert(NewPos && "Iterator should have position after successful advancement"); diff --git a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp index 87477e96d2d16..a157ee2da5df4 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp @@ -509,7 +509,7 @@ ProgramStateRef MacOSKeychainAPIChecker::evalAssume(ProgramStateRef State, if (AMap.isEmpty()) return State; - auto *CondBSE = dyn_cast_or_null(Cond.getAsSymExpr()); + auto *CondBSE = dyn_cast_or_null(Cond.getAsSymbol()); if (!CondBSE) return State; BinaryOperator::Opcode OpCode = CondBSE->getOpcode(); diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp index 1d8ed90f7590c..854646a8779d7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountDiagnostics.cpp @@ -177,7 +177,7 @@ static Optional findArgIdxOfSymbol(ProgramStateRef CurrSt, for (unsigned Idx = 0; Idx < (*CE)->getNumArgs(); Idx++) if (const MemRegion *MR = (*CE)->getArgSVal(Idx).getAsRegion()) if (const auto *TR = dyn_cast(MR)) - if (CurrSt->getSVal(MR, TR->getValueType()).getAsSymExpr() == Sym) + if (CurrSt->getSVal(MR, TR->getValueType()).getAsSymbol() == Sym) return Idx; return None; diff --git a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp index 5b46ffb656cf8..71b2ab834a07a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/Taint.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/Taint.cpp @@ -148,7 +148,7 @@ bool taint::isTainted(ProgramStateRef State, const Stmt *S, } bool taint::isTainted(ProgramStateRef State, SVal V, TaintTagType Kind) { - if (const SymExpr *Sym = V.getAsSymExpr()) + if (SymbolRef Sym = V.getAsSymbol()) return isTainted(State, Sym, Kind); if (const MemRegion *Reg = V.getAsRegion()) return isTainted(State, Reg, Kind); diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp index 9e6d79bb7dcc9..1ccf4c6104a65 100644 --- a/clang/lib/StaticAnalyzer/Core/Environment.cpp +++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp @@ -183,18 +183,12 @@ EnvironmentManager::removeDeadBindings(Environment Env, F.getTreeFactory()); // Iterate over the block-expr bindings. - for (Environment::iterator I = Env.begin(), E = Env.end(); I != E; ++I) { + for (Environment::iterator I = Env.begin(), E = Env.end(); + I != E; ++I) { const EnvironmentEntry &BlkExpr = I.getKey(); const SVal &X = I.getData(); - const bool IsBlkExprLive = - SymReaper.isLive(BlkExpr.getStmt(), BlkExpr.getLocationContext()); - - assert((isa(BlkExpr.getStmt()) || !IsBlkExprLive) && - "Only Exprs can be live, LivenessAnalysis argues about the liveness " - "of *values*!"); - - if (IsBlkExprLive) { + if (SymReaper.isLive(BlkExpr.getStmt(), BlkExpr.getLocationContext())) { // Copy the binding to the new map. 
EBMapRef = EBMapRef.add(BlkExpr, X); diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp index 006a4006b7fc9..1ccb0de92fba3 100644 --- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp +++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp @@ -582,9 +582,6 @@ bool ScanReachableSymbols::scan(SVal val) { if (SymbolRef Sym = val.getAsSymbol()) return scan(Sym); - if (const SymExpr *Sym = val.getAsSymbolicExpression()) - return scan(Sym); - if (Optional X = val.getAs()) return scan(*X); diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp index c00a2c8ba8a2c..5b6b6973b310c 100644 --- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp @@ -377,8 +377,8 @@ Optional SValBuilder::getConstantVal(const Expr *E) { SVal SValBuilder::makeSymExprValNN(BinaryOperator::Opcode Op, NonLoc LHS, NonLoc RHS, QualType ResultTy) { - const SymExpr *symLHS = LHS.getAsSymExpr(); - const SymExpr *symRHS = RHS.getAsSymExpr(); + SymbolRef symLHS = LHS.getAsSymbol(); + SymbolRef symRHS = RHS.getAsSymbol(); // TODO: When the Max Complexity is reached, we should conjure a symbol // instead of generating an Unknown value and propagate the taint info to it. @@ -492,7 +492,7 @@ SVal SValBuilder::evalIntegralCast(ProgramStateRef state, SVal val, if (getContext().getTypeSize(castTy) >= getContext().getTypeSize(originalTy)) return evalCast(val, castTy, originalTy); - const SymExpr *se = val.getAsSymbolicExpression(); + SymbolRef se = val.getAsSymbol(); if (!se) // Let evalCast handle non symbolic expressions. return evalCast(val, castTy, originalTy); diff --git a/clang/lib/StaticAnalyzer/Core/SVals.cpp b/clang/lib/StaticAnalyzer/Core/SVals.cpp index 9b5de6c3eb92b..465800fa67fce 100644 --- a/clang/lib/StaticAnalyzer/Core/SVals.cpp +++ b/clang/lib/StaticAnalyzer/Core/SVals.cpp @@ -116,8 +116,6 @@ SymbolRef SVal::getLocSymbolInBase() const { return nullptr; } -// TODO: The next 3 functions have to be simplified. - /// If this SVal wraps a symbol return that SymbolRef. /// Otherwise, return 0. /// @@ -132,22 +130,6 @@ SymbolRef SVal::getAsSymbol(bool IncludeBaseRegions) const { return getAsLocSymbol(IncludeBaseRegions); } -/// getAsSymbolicExpression - If this Sval wraps a symbolic expression then -/// return that expression. Otherwise return NULL. -const SymExpr *SVal::getAsSymbolicExpression() const { - if (Optional X = getAs()) - return X->getSymbol(); - - return getAsSymbol(); -} - -const SymExpr* SVal::getAsSymExpr() const { - const SymExpr* Sym = getAsSymbol(); - if (!Sym) - Sym = getAsSymbolicExpression(); - return Sym; -} - const MemRegion *SVal::getAsRegion() const { if (Optional X = getAs()) return X->getRegion(); diff --git a/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp index 3709106ad44ce..f96974f97dcc5 100644 --- a/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SimpleConstraintManager.cpp @@ -57,7 +57,7 @@ ProgramStateRef SimpleConstraintManager::assumeAux(ProgramStateRef State, // SymIntExprs. if (!canReasonAbout(Cond)) { // Just add the constraint to the expression without trying to simplify. 
- SymbolRef Sym = Cond.getAsSymExpr(); + SymbolRef Sym = Cond.getAsSymbol(); assert(Sym); return assumeSymUnsupported(State, Sym, Assumption); } @@ -101,7 +101,7 @@ ProgramStateRef SimpleConstraintManager::assumeInclusiveRange( if (!canReasonAbout(Value)) { // Just add the constraint to the expression without trying to simplify. - SymbolRef Sym = Value.getAsSymExpr(); + SymbolRef Sym = Value.getAsSymbol(); assert(Sym); return assumeSymInclusiveRange(State, Sym, From, To, InRange); } diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp index 2e269f6a596e8..a64ed78ac3458 100644 --- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp @@ -86,7 +86,7 @@ SVal SimpleSValBuilder::evalCastFromNonLoc(NonLoc val, QualType castTy) { return makeLocAsInteger(LI->getLoc(), castSize); } - if (const SymExpr *se = val.getAsSymbolicExpression()) { + if (SymbolRef se = val.getAsSymbol()) { QualType T = Context.getCanonicalType(se->getType()); // If types are the same or both are integers, ignore the cast. // FIXME: Remove this hack when we support symbolic truncation/extension. diff --git a/clang/lib/Tooling/Syntax/Tree.cpp b/clang/lib/Tooling/Syntax/Tree.cpp index 37579e6145b65..2944819afc96c 100644 --- a/clang/lib/Tooling/Syntax/Tree.cpp +++ b/clang/lib/Tooling/Syntax/Tree.cpp @@ -33,8 +33,8 @@ static void traverse(syntax::Node *N, } // namespace syntax::Arena::Arena(SourceManager &SourceMgr, const LangOptions &LangOpts, - TokenBuffer Tokens) - : SourceMgr(SourceMgr), LangOpts(LangOpts), Tokens(std::move(Tokens)) {} + const TokenBuffer &Tokens) + : SourceMgr(SourceMgr), LangOpts(LangOpts), Tokens(Tokens) {} const clang::syntax::TokenBuffer &syntax::Arena::tokenBuffer() const { return Tokens; diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 40b6cff0d627a..1ee8ce28c2efa 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -245,27 +245,38 @@ std::string getAbsolutePath(StringRef File) { void addTargetAndModeForProgramName(std::vector &CommandLine, StringRef InvokedAs) { - if (!CommandLine.empty() && !InvokedAs.empty()) { - bool AlreadyHasTarget = false; - bool AlreadyHasMode = false; - // Skip CommandLine[0]. 
- for (auto Token = ++CommandLine.begin(); Token != CommandLine.end(); - ++Token) { - StringRef TokenRef(*Token); - AlreadyHasTarget |= - (TokenRef == "-target" || TokenRef.startswith("-target=")); - AlreadyHasMode |= (TokenRef == "--driver-mode" || - TokenRef.startswith("--driver-mode=")); - } - auto TargetMode = - driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs); - if (!AlreadyHasMode && TargetMode.DriverMode) { - CommandLine.insert(++CommandLine.begin(), TargetMode.DriverMode); - } - if (!AlreadyHasTarget && TargetMode.TargetIsValid) { - CommandLine.insert(++CommandLine.begin(), {"-target", - TargetMode.TargetPrefix}); - } + if (CommandLine.empty() || InvokedAs.empty()) + return; + const auto &Table = driver::getDriverOptTable(); + // --target=X + const std::string TargetOPT = + Table.getOption(driver::options::OPT_target).getPrefixedName(); + // -target X + const std::string TargetOPTLegacy = + Table.getOption(driver::options::OPT_target_legacy_spelling) + .getPrefixedName(); + // --driver-mode=X + const std::string DriverModeOPT = + Table.getOption(driver::options::OPT_driver_mode).getPrefixedName(); + auto TargetMode = + driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs); + // No need to search for target args if we don't have a target/mode to insert. + bool ShouldAddTarget = TargetMode.TargetIsValid; + bool ShouldAddMode = TargetMode.DriverMode != nullptr; + // Skip CommandLine[0]. + for (auto Token = ++CommandLine.begin(); Token != CommandLine.end(); + ++Token) { + StringRef TokenRef(*Token); + ShouldAddTarget = ShouldAddTarget && !TokenRef.startswith(TargetOPT) && + !TokenRef.equals(TargetOPTLegacy); + ShouldAddMode = ShouldAddMode && !TokenRef.startswith(DriverModeOPT); + } + if (ShouldAddMode) { + CommandLine.insert(++CommandLine.begin(), TargetMode.DriverMode); + } + if (ShouldAddTarget) { + CommandLine.insert(++CommandLine.begin(), + TargetOPT + TargetMode.TargetPrefix); } } diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp index 1579115b93138..fb5fd4a800bbb 100644 --- a/clang/lib/Tooling/Transformer/Parsing.cpp +++ b/clang/lib/Tooling/Transformer/Parsing.cpp @@ -109,14 +109,14 @@ getUnaryRangeSelectors() { static const llvm::StringMap> & getBinaryStringSelectors() { static const llvm::StringMap> M = { - {"encloseNodes", range}}; + {"encloseNodes", encloseNodes}}; return M; } static const llvm::StringMap> & getBinaryRangeSelectors() { static const llvm::StringMap> - M = {{"enclose", range}}; + M = {{"enclose", enclose}, {"between", between}}; return M; } diff --git a/clang/lib/Tooling/Transformer/RewriteRule.cpp b/clang/lib/Tooling/Transformer/RewriteRule.cpp index a212a868c81d8..ce773b59a7e7e 100644 --- a/clang/lib/Tooling/Transformer/RewriteRule.cpp +++ b/clang/lib/Tooling/Transformer/RewriteRule.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Transformer/RewriteRule.h" +#include "clang/AST/ASTTypeTraits.h" +#include "clang/AST/Stmt.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Basic/SourceLocation.h" @@ -68,6 +70,24 @@ EditGenerator transformer::edit(ASTEdit Edit) { }; } +EditGenerator +transformer::flattenVector(SmallVector Generators) { + if (Generators.size() == 1) + return std::move(Generators[0]); + return + [Gs = std::move(Generators)]( + const MatchResult &Result) -> llvm::Expected> { + SmallVector AllEdits; + for (const auto &G : Gs) { + llvm::Expected> 
Edits = G(Result); + if (!Edits) + return Edits.takeError(); + AllEdits.append(Edits->begin(), Edits->end()); + } + return AllEdits; + }; +} + ASTEdit transformer::changeTo(RangeSelector Target, TextGenerator Replacement) { ASTEdit E; E.TargetRange = std::move(Target); @@ -97,15 +117,144 @@ ASTEdit transformer::remove(RangeSelector S) { return change(std::move(S), std::make_shared("")); } -RewriteRule transformer::makeRule(ast_matchers::internal::DynTypedMatcher M, - EditGenerator Edits, +RewriteRule transformer::makeRule(DynTypedMatcher M, EditGenerator Edits, TextGenerator Explanation) { return RewriteRule{{RewriteRule::Case{ std::move(M), std::move(Edits), std::move(Explanation), {}}}}; } +namespace { + +/// Unconditionally binds the given node set before trying `InnerMatcher` and +/// keeps the bound nodes on a successful match. +template +class BindingsMatcher : public ast_matchers::internal::MatcherInterface { + ast_matchers::BoundNodes Nodes; + const ast_matchers::internal::Matcher InnerMatcher; + +public: + explicit BindingsMatcher(ast_matchers::BoundNodes Nodes, + ast_matchers::internal::Matcher InnerMatcher) + : Nodes(std::move(Nodes)), InnerMatcher(std::move(InnerMatcher)) {} + + bool matches( + const T &Node, ast_matchers::internal::ASTMatchFinder *Finder, + ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override { + ast_matchers::internal::BoundNodesTreeBuilder Result(*Builder); + for (const auto &N : Nodes.getMap()) + Result.setBinding(N.first, N.second); + if (InnerMatcher.matches(Node, Finder, &Result)) { + *Builder = std::move(Result); + return true; + } + return false; + } +}; + +/// Matches nodes of type T that have at least one descendant node for which the +/// given inner matcher matches. Will match for each descendant node that +/// matches. Based on ForEachDescendantMatcher, but takes a dynamic matcher, +/// instead of a static one, because it is used by RewriteRule, which carries +/// (only top-level) dynamic matchers. 
+template +class DynamicForEachDescendantMatcher + : public ast_matchers::internal::MatcherInterface { + const DynTypedMatcher DescendantMatcher; + +public: + explicit DynamicForEachDescendantMatcher(DynTypedMatcher DescendantMatcher) + : DescendantMatcher(std::move(DescendantMatcher)) {} + + bool matches( + const T &Node, ast_matchers::internal::ASTMatchFinder *Finder, + ast_matchers::internal::BoundNodesTreeBuilder *Builder) const override { + return Finder->matchesDescendantOf( + Node, this->DescendantMatcher, Builder, + ast_matchers::internal::ASTMatchFinder::BK_All); + } +}; + +template +ast_matchers::internal::Matcher +forEachDescendantDynamically(ast_matchers::BoundNodes Nodes, + DynTypedMatcher M) { + return ast_matchers::internal::makeMatcher(new BindingsMatcher( + std::move(Nodes), + ast_matchers::internal::makeMatcher( + new DynamicForEachDescendantMatcher(std::move(M))))); +} + +class ApplyRuleCallback : public MatchFinder::MatchCallback { +public: + ApplyRuleCallback(RewriteRule Rule) : Rule(std::move(Rule)) {} + + template + void registerMatchers(const ast_matchers::BoundNodes &Nodes, + MatchFinder *MF) { + for (auto &Matcher : transformer::detail::buildMatchers(Rule)) + MF->addMatcher(forEachDescendantDynamically(Nodes, Matcher), this); + } + + void run(const MatchFinder::MatchResult &Result) override { + if (!Edits) + return; + transformer::RewriteRule::Case Case = + transformer::detail::findSelectedCase(Result, Rule); + auto Transformations = Case.Edits(Result); + if (!Transformations) { + Edits = Transformations.takeError(); + return; + } + Edits->append(Transformations->begin(), Transformations->end()); + } + + RewriteRule Rule; + + // Initialize to a non-error state. + Expected> Edits = SmallVector(); +}; +} // namespace + +template +llvm::Expected> +rewriteDescendantsImpl(const T &Node, RewriteRule Rule, + const MatchResult &Result) { + ApplyRuleCallback Callback(std::move(Rule)); + MatchFinder Finder; + Callback.registerMatchers(Result.Nodes, &Finder); + Finder.match(Node, *Result.Context); + return std::move(Callback.Edits); +} + +EditGenerator transformer::rewriteDescendants(std::string NodeId, + RewriteRule Rule) { + // FIXME: warn or return error if `Rule` contains any `AddedIncludes`, since + // these will be dropped. 
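// A hypothetical usage sketch for rewriteDescendants (the matchers and the
// cat() stencil are assumed from the surrounding Transformer library, not
// defined in this patch): apply an inner rule to every matching descendant
// of a bound node.
//
//   RewriteRule Inner = makeRule(declRefExpr(to(varDecl(hasName("x")))),
//                                changeTo(cat("y")));
//   RewriteRule Outer =
//       makeRule(functionDecl(hasName("f"), hasBody(stmt().bind("body"))),
//                rewriteDescendants("body", Inner));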
+ return [NodeId = std::move(NodeId), + Rule = std::move(Rule)](const MatchResult &Result) + -> llvm::Expected> { + const ast_matchers::BoundNodes::IDToNodeMap &NodesMap = + Result.Nodes.getMap(); + auto It = NodesMap.find(NodeId); + if (It == NodesMap.end()) + return llvm::make_error(llvm::errc::invalid_argument, + "ID not bound: " + NodeId); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + if (auto *Node = It->second.get()) + return rewriteDescendantsImpl(*Node, std::move(Rule), Result); + + return llvm::make_error( + llvm::errc::invalid_argument, + "type unsupported for recursive rewriting, ID=\"" + NodeId + + "\", Kind=" + It->second.getNodeKind().asStringRef()); + }; +} + void transformer::addInclude(RewriteRule &Rule, StringRef Header, - IncludeFormat Format) { + IncludeFormat Format) { for (auto &Case : Rule.Cases) Case.AddedIncludes.emplace_back(Header.str(), Format); } diff --git a/clang/lib/Tooling/Transformer/Transformer.cpp b/clang/lib/Tooling/Transformer/Transformer.cpp index e8fc00c4e953f..5b5be7a396db4 100644 --- a/clang/lib/Tooling/Transformer/Transformer.cpp +++ b/clang/lib/Tooling/Transformer/Transformer.cpp @@ -38,13 +38,8 @@ void Transformer::run(const MatchFinder::MatchResult &Result) { return; } - if (Transformations->empty()) { - // No rewrite applied (but no error encountered either). - transformer::detail::getRuleMatchLoc(Result).print( - llvm::errs() << "note: skipping match at loc ", *Result.SourceManager); - llvm::errs() << "\n"; + if (Transformations->empty()) return; - } // Group the transformations, by file, into AtomicChanges, each anchored by // the location of the first change in that file. diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp new file mode 100644 index 0000000000000..7050ee0fb4492 --- /dev/null +++ b/clang/test/AST/ast-dump-concepts.cpp @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -triple x86_64-unknown-unknown -include-pch %t \ +// RUN: -ast-dump-all -ast-dump-filter Foo /dev/null \ +// RUN: | FileCheck --strict-whitespace %s + +template +concept unary_concept = true; + +template +concept binary_concept = true; + +template +struct Foo { + // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept' + // CHECK-NEXT: |-ConceptSpecializationExpr {{.*}} 'bool' + // CHECK-NEXT: `-TemplateArgument {{.*}} type 'int' + template R> + Foo(R); + + // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept' + // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' + template + Foo(R); + + // CHECK: FunctionTemplateDecl {{.*}} {{.*}} Foo + template + Foo(R, int) requires unary_concept; + + // CHECK: FunctionTemplateDecl {{.*}} {{.*}} Foo + template + Foo(R, char) requires unary_concept { + } +}; diff --git a/clang/test/Analysis/Inputs/ctu-import.c b/clang/test/Analysis/Inputs/ctu-import.c new file mode 100644 index 0000000000000..6c99a36427978 --- /dev/null +++ b/clang/test/Analysis/Inputs/ctu-import.c @@ -0,0 +1,15 @@ + +// Use an internal, implicitly defined type, called by +// a function imported for CTU. This should not crash. 
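// The function imported for CTU here is testStaticImplicit(), which the
// externalDefMap entry below maps to this file's AST; foobar() uses
// __NSConstantString, an implicitly defined internal type that the
// ASTImporter must be able to handle without crashing.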
+int foo(void); +int foobar(int skip) { + __NSConstantString str = {.flags = 1}; + + if (str.flags >= 0) + str.flags = 0; + return 4; +} + +int testStaticImplicit(void) { + return foobar(3); +} diff --git a/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt b/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt new file mode 100644 index 0000000000000..83d3b4ca451e8 --- /dev/null +++ b/clang/test/Analysis/Inputs/ctu-import.c.externalDefMap.ast-dump.txt @@ -0,0 +1 @@ +c:@F@testStaticImplicit ctu-import.c.ast diff --git a/clang/test/Analysis/ctu-implicit.c b/clang/test/Analysis/ctu-implicit.c new file mode 100644 index 0000000000000..925044845e090 --- /dev/null +++ b/clang/test/Analysis/ctu-implicit.c @@ -0,0 +1,20 @@ +// RUN: rm -rf %t && mkdir %t +// RUN: mkdir -p %t/ctudir2 +// RUN: %clang_cc1 \ +// RUN: -emit-pch -o %t/ctudir2/ctu-import.c.ast %S/Inputs/ctu-import.c +// RUN: cp %S/Inputs/ctu-import.c.externalDefMap.ast-dump.txt %t/ctudir2/externalDefMap.txt +// RUN: %clang_cc1 -analyze \ +// RUN: -analyzer-checker=core,debug.ExprInspection \ +// RUN: -analyzer-config experimental-enable-naive-ctu-analysis=true \ +// RUN: -analyzer-config display-ctu-progress=true \ +// RUN: -analyzer-config ctu-dir=%t/ctudir2 \ +// RUN: -verify %s + +void clang_analyzer_eval(int); + +int testStaticImplicit(void); +int func(void) { + int ret = testStaticImplicit(); + clang_analyzer_eval(ret == 4); // expected-warning{{TRUE}} + return testStaticImplicit(); +} diff --git a/clang/test/Analysis/fuchsia_handle.cpp b/clang/test/Analysis/fuchsia_handle.cpp index dade5261bd78e..d104b13c77abf 100644 --- a/clang/test/Analysis/fuchsia_handle.cpp +++ b/clang/test/Analysis/fuchsia_handle.cpp @@ -77,7 +77,9 @@ void handleDieBeforeErrorSymbol01() { void handleDieBeforeErrorSymbol02() { zx_handle_t sa, sb; zx_status_t status = zx_channel_create(0, &sa, &sb); - // expected-note@-1 {{Handle allocated through 2nd parameter}} + // FIXME: There appears to be non-determinism in choosing + // which handle to report. 
+ // expected-note-re@-3 {{Handle allocated through {{(2nd|3rd)}} parameter}} if (status == 0) { // expected-note {{Assuming 'status' is equal to 0}} // expected-note@-1 {{Taking true branch}} return; // expected-warning {{Potential leak of handle}} diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp index 0b76b0bfa7232..f1538839d06c8 100644 --- a/clang/test/Analysis/iterator-modeling.cpp +++ b/clang/test/Analysis/iterator-modeling.cpp @@ -149,7 +149,7 @@ void copy(const std::vector &v) { clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning-re {{$v.end(){{$}}}} } -void plus(const std::vector &v) { +void plus_lhs(const std::vector &v) { auto i1 = v.begin(); clang_analyzer_denote(clang_analyzer_container_begin(v), "$v.begin()"); @@ -161,7 +161,19 @@ void plus(const std::vector &v) { clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning-re{{$v.begin() + 2{{$}}}} } -void plus_negative(const std::vector &v) { +void plus_rhs(const std::vector &v) { + auto i1 = v.begin(); + + clang_analyzer_denote(clang_analyzer_container_begin(v), "$v.begin()"); + + auto i2 = 2 + i1; + + clang_analyzer_eval(clang_analyzer_iterator_container(i2) == &v); // expected-warning{{TRUE}} + clang_analyzer_express(clang_analyzer_iterator_position(i1)); // expected-warning-re{{$v.begin(){{$}}}} + clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning-re{{$v.begin() + 2{{$}}}} +} + +void plus_lhs_negative(const std::vector &v) { auto i1 = v.end(); clang_analyzer_denote(clang_analyzer_container_end(v), "$v.end()"); @@ -173,6 +185,18 @@ void plus_negative(const std::vector &v) { clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning-re {{$v.end() - 2{{$}}}} } +void plus_rhs_negative(const std::vector &v) { + auto i1 = v.end(); + + clang_analyzer_denote(clang_analyzer_container_end(v), "$v.end()"); + + auto i2 = (-2) + i1; + + clang_analyzer_eval(clang_analyzer_iterator_container(i2) == &v); // expected-warning{{TRUE}} + clang_analyzer_express(clang_analyzer_iterator_position(i1)); // expected-warning-re {{$v.end(){{$}}}} + clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning-re {{$v.end() - 2{{$}}}} +} + void minus(const std::vector &v) { auto i1 = v.end(); @@ -1955,7 +1979,7 @@ void minus_equal_ptr_iterator_variable(const cont_with_ptr_iterator &c, i -= n; // no-crash } -void plus_ptr_iterator(const cont_with_ptr_iterator &c) { +void plus_lhs_ptr_iterator(const cont_with_ptr_iterator &c) { auto i1 = c.begin(); clang_analyzer_denote(clang_analyzer_container_begin(c), "$c.begin()"); @@ -1967,6 +1991,18 @@ void plus_ptr_iterator(const cont_with_ptr_iterator &c) { clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning{{$c.begin() + 2}} } +void plus_rhs_ptr_iterator(const cont_with_ptr_iterator &c) { + auto i1 = c.begin(); + + clang_analyzer_denote(clang_analyzer_container_begin(c), "$c.begin()"); + + auto i2 = 2 + i1; + + clang_analyzer_eval(clang_analyzer_iterator_container(i2) == &c); // expected-warning{{TRUE}} + clang_analyzer_express(clang_analyzer_iterator_position(i1)); // expected-warning{{$c.begin()}} + clang_analyzer_express(clang_analyzer_iterator_position(i2)); // expected-warning{{$c.begin() + 2}} +} + void minus_ptr_iterator(const cont_with_ptr_iterator &c) { auto i1 = c.end(); diff --git a/clang/test/Analysis/live-stmts.mm b/clang/test/Analysis/live-stmts.mm new file mode 100644 index 
0000000000000..a6ddd03ca5d85 --- /dev/null +++ b/clang/test/Analysis/live-stmts.mm @@ -0,0 +1,101 @@ +// RUN: %clang_analyze_cc1 -w -fblocks %s \ +// RUN: -analyzer-checker=debug.DumpLiveStmts \ +// RUN: 2>&1 | FileCheck %s + +@interface Item +// ... +@end + +@interface Collection +// ... +@end + +typedef void (^Blk)(); + +struct RAII { + Blk blk; + +public: + RAII(Blk blk): blk(blk) {} + +// CHECK: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: + + ~RAII() { blk(); } + +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +}; + +void foo(Collection *coll) { + RAII raii(^{}); + for (Item *item in coll) {} +} +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B2 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B3 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B4 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'Collection *' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'Collection *' lvalue ParmVar {{.*}} 'coll' 'Collection *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B5 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-NEXT: DeclStmt {{.*}} +// CHECK-NEXT: `-VarDecl {{.*}} item 'Item *' +// CHECK-EMPTY: +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B0 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK-NEXT: [ B1 (live statements at block exit) ] +// CHECK-EMPTY: +// CHECK-EMPTY: + diff --git a/clang/test/Analysis/report-uniqueing.cpp b/clang/test/Analysis/report-uniqueing.cpp new file mode 100644 index 0000000000000..0e4d50e13a20d --- /dev/null +++ b/clang/test/Analysis/report-uniqueing.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=security + +void bzero(void *, unsigned long); + +template void foo(T l) { + // The warning comes from multiple instances and with + // different declarations that have same source location. + // One instance should be shown. 
+ bzero(l, 1); // expected-warning{{The bzero() function is obsoleted}} +} + +void p(int *p, unsigned *q) { + foo(p); + foo(q); +} diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp index 0d4d34ac0e147..d92356c1ec0b2 100644 --- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.noreturn/p1.cpp @@ -8,9 +8,10 @@ void a2 [[noreturn]] () { } template void a3 [[noreturn]] () {} -template <> void a3 () { return; } // expected-warning {{function 'a3' declared 'noreturn' should not return}} +template <> void a3() { return; } // expected-warning {{function 'a3' declared 'noreturn' should not return}} -template void a4 [[noreturn]] () { return; } // expected-warning 2{{function 'a4' declared 'noreturn' should not return}} +template void a4 [[noreturn]] () { return; } // expected-warning {{function 'a4' declared 'noreturn' should not return}} + // expected-warning@-1 {{function 'a4' declared 'noreturn' should not return}} void a4_test() { a4(); } // expected-note {{in instantiation of function template specialization 'a4' requested here}} [[noreturn, noreturn]] void b() { throw 0; } // expected-error {{attribute 'noreturn' cannot appear multiple times in an attribute specifier}} diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp index ad827fb7b3142..415d634d5f98c 100644 --- a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp +++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p3.cpp @@ -1,3 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s void f(int) { } // expected-note {{previous definition is here}} void f(const int) { } // expected-error {{redefinition of 'f'}} + +template void ft(T) {} +template <> void ft(int) {} // expected-note {{previous definition is here}} +template <> void ft(int) {} // expected-error {{redefinition of 'ft'}} diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index 3268838ac6c85..c265ebbe359cb 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -113,3 +113,35 @@ namespace dr2387 { // dr2387: 9 extern template const int d; #endif } + +#if __cplusplus >= 201103L +namespace dr2303 { // dr2303: 12 +template +struct A; +template <> +struct A<> {}; +template +struct A : A {}; +struct B : A {}; +struct C : A, A {}; // expected-warning {{direct base 'A' is inaccessible}} +struct D : A, A {}; // expected-warning {{direct base 'A' is inaccessible}} +struct E : A {}; +struct F : B, E {}; + +template +void f(const A &) { + static_assert(sizeof...(T) == 2, "Should only match A"); +} +template +void f2(const A *); + +void g() { + f(B{}); // This is no longer ambiguous. 
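// Under the DR2303 resolution, deduction against `const A<T...> &` prefers
// the most-derived candidate base, so B (which derives from both A<int, int>
// and, transitively, A<int>) deduces T... as (int, int).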
+ B b; + f2(&b); + f(C{}); + f(D{}); + f(F{}); // expected-error {{ambiguous conversion from derived class}} +} +} //namespace dr2303 +#endif diff --git a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp index 01c758bef19a5..7c95a3ca88eb3 100644 --- a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp +++ b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp @@ -64,6 +64,21 @@ namespace X { }; constexpr int operator*(const C::It &) { return 0; } + + struct D { + D(); + using Ty = int[2]; + Ty *begin(); + Ty *end(); + }; + + void test_D() { +#if __cplusplus >= 201703L + for (extern auto [x, y] : D()) { + } // expected-error@-1 {{decomposition declaration cannot be declared 'extern'}} + // expected-error@-2 {{loop variable '[x, y]' may not be declared 'extern'}} +#endif + } } using X::A; diff --git a/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp b/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp index 667152da1cbcd..63f56640b1ce9 100644 --- a/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp +++ b/clang/test/CXX/temp/temp.param/p15-cxx0x.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s -template struct X; // expected-note {{'X' is incomplete}} +template struct X; // expected-note {{'X>>' is incomplete}} template struct Y; X> *x1; @@ -14,8 +14,8 @@ typedef X X_int; struct Z : X_int { }; void f(const X x) { - (void)reinterpret_cast>(x); // expected-error{{reinterpret_cast from}} - (void)reinterpret_cast>>>(x); // expected-error{{reinterpret_cast from}} + (void)reinterpret_cast>(x); // expected-error{{reinterpret_cast from 'const X' to 'X' is not allowed}} + (void)reinterpret_cast>>>(x); // expected-error{{reinterpret_cast from 'const X' to 'X>>' is not allowed}} X> *x1; } diff --git a/clang/test/CXX/temp/temp.pre/p6.cpp b/clang/test/CXX/temp/temp.pre/p6.cpp new file mode 100644 index 0000000000000..cb8c70ca3abed --- /dev/null +++ b/clang/test/CXX/temp/temp.pre/p6.cpp @@ -0,0 +1,79 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s + +// Templates and partial and explicit specializations can't have C linkage. +namespace extern_c_templates { + +template struct A { + static int a; + struct b; + void c(); + enum class d; + + template static int e; + template struct f; + template void g(); +}; + +template int B; +template void C(); + +extern "C" { // expected-note 1+{{begins here}} + // templates + template struct A; // expected-error {{templates must have C++ linkage}} + template int B; // expected-error {{templates must have C++ linkage}} + template void C(); // expected-error {{templates must have C++ linkage}} + + // non-template members of a template + // FIXME: Should these really be valid? 
+ template int A::a; + template struct A::b {}; + template void A::c() {} + template enum class A::d {}; + + // templates + template template int A::e; // expected-error {{templates must have C++ linkage}} + template template struct A::f {}; // expected-error {{templates must have C++ linkage}} + template template void A::g() {} // expected-error {{templates must have C++ linkage}} + + // partial specializations + template struct A; // expected-error {{templates must have C++ linkage}} + template int B; // expected-error {{templates must have C++ linkage}} + template template int A::e; // expected-error {{templates must have C++ linkage}} + template template struct A::f {}; // expected-error {{templates must have C++ linkage}} + + // explicit specializations of templates + template<> struct A {}; // expected-error {{templates must have C++ linkage}} + template<> int B; // expected-error {{templates must have C++ linkage}} + template<> void C() {} // expected-error {{templates must have C++ linkage}} + + // explicit specializations of members of a template + template<> int A::a; // expected-error {{templates must have C++ linkage}} + template<> struct A::b {}; // expected-error {{templates must have C++ linkage}} + template<> void A::c() {} // expected-error {{templates must have C++ linkage}} + template<> enum class A::d {}; // expected-error {{templates must have C++ linkage}} + + // explicit specializations of member templates + template<> template int A::e; // expected-error {{templates must have C++ linkage}} + template<> template struct A::f {}; // expected-error {{templates must have C++ linkage}} + template<> template void A::g() {} // expected-error {{templates must have C++ linkage}} +} + +// Provide valid definitions for the explicit instantiations below. +// FIXME: Our recovery from the invalid definitions above isn't very good. +template template int A::e; +template template struct A::f {}; +template template void A::g() {} + +extern "C" { + // explicit instantiations + // FIXME: Should these really be valid? 
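// (Rationale for the FIXME: an explicit instantiation is a use of a template
// rather than a template declaration, so the [temp.pre]p6 prohibition on
// templates with C linkage arguably does not reach it.)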
+ template struct A; + template int A::a; + template struct A::b; + template void A::c(); + template int A::e; + template struct A::f; + template void A::g(); +} + +} diff --git a/clang/test/CodeGen/Inputs/thinlto_expect1.proftext b/clang/test/CodeGen/Inputs/thinlto_expect1.proftext index e7ce3a4ee2379..0c904e2ea1c89 100644 --- a/clang/test/CodeGen/Inputs/thinlto_expect1.proftext +++ b/clang/test/CodeGen/Inputs/thinlto_expect1.proftext @@ -2,7 +2,7 @@ :ir foo # Func Hash: -25571299074 +784007059655560962 # Num Counters: 2 # Counter Values: diff --git a/clang/test/CodeGen/Inputs/thinlto_expect2.proftext b/clang/test/CodeGen/Inputs/thinlto_expect2.proftext index f9de785587abe..c240a442c465f 100644 --- a/clang/test/CodeGen/Inputs/thinlto_expect2.proftext +++ b/clang/test/CodeGen/Inputs/thinlto_expect2.proftext @@ -2,7 +2,7 @@ :csir foo # Func Hash: -25571299074 +784007059655560962 # Num Counters: 2 # Counter Values: @@ -11,7 +11,7 @@ foo foo # Func Hash: -1152921530178146050 +1936928564262407938 # Num Counters: 2 # Counter Values: diff --git a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c index 929a6501df77f..f06432c914f25 100644 --- a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c +++ b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c @@ -125,28 +125,28 @@ uint64_t test_vcvtpd_u64_f64(float64_t a) { } // CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 { -// CHECK: [[TMP0:%.*]] = fptosi float %a to i32 +// CHECK: [[TMP0:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a) // CHECK: ret i32 [[TMP0]] int32_t test_vcvts_s32_f32(float32_t a) { return (int32_t)vcvts_s32_f32(a); } // CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 { -// CHECK: [[TMP0:%.*]] = fptosi double %a to i64 +// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %a) // CHECK: ret i64 [[TMP0]] int64_t test_vcvtd_s64_f64(float64_t a) { return (int64_t)vcvtd_s64_f64(a); } // CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 { -// CHECK: [[TMP0:%.*]] = fptoui float %a to i32 +// CHECK: [[TMP0:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %a) // CHECK: ret i32 [[TMP0]] uint32_t test_vcvts_u32_f32(float32_t a) { return (uint32_t)vcvts_u32_f32(a); } // CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 { -// CHECK: [[TMP0:%.*]] = fptoui double %a to i64 +// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %a) // CHECK: ret i64 [[TMP0]] uint64_t test_vcvtd_u64_f64(float64_t a) { return (uint64_t)vcvtd_u64_f64(a); diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c index fbf3c2c5b63e8..de7e770256058 100644 --- a/clang/test/CodeGen/aarch64-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-neon-intrinsics.c @@ -18023,7 +18023,7 @@ float64x1_t test_vneg_f64(float64x1_t a) { // CHECK-LABEL: @test_vcvt_s64_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fptosi <1 x double> %a to <1 x i64> +// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a) // CHECK: ret <1 x i64> [[TMP1]] int64x1_t test_vcvt_s64_f64(float64x1_t a) { return vcvt_s64_f64(a); @@ -18031,7 +18031,7 @@ int64x1_t test_vcvt_s64_f64(float64x1_t a) { // CHECK-LABEL: @test_vcvt_u64_f64( // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fptoui <1 x double> %a to <1 x i64> +// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x 
double> %a) // CHECK: ret <1 x i64> [[TMP1]] uint64x1_t test_vcvt_u64_f64(float64x1_t a) { return vcvt_u64_f64(a); diff --git a/clang/test/CodeGen/aarch64-neon-misc.c b/clang/test/CodeGen/aarch64-neon-misc.c index acc3c0b7d2d4e..14b5a357f61ee 100644 --- a/clang/test/CodeGen/aarch64-neon-misc.c +++ b/clang/test/CodeGen/aarch64-neon-misc.c @@ -2311,7 +2311,7 @@ float64x2_t test_vrndiq_f64(float64x2_t a) { // CHECK-LABEL: @test_vcvt_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fptosi <2 x float> %a to <2 x i32> +// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %a) // CHECK: ret <2 x i32> [[TMP1]] int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); @@ -2319,7 +2319,7 @@ int32x2_t test_vcvt_s32_f32(float32x2_t a) { // CHECK-LABEL: @test_vcvtq_s32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fptosi <4 x float> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %a) // CHECK: ret <4 x i32> [[TMP1]] int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); @@ -2327,7 +2327,7 @@ int32x4_t test_vcvtq_s32_f32(float32x4_t a) { // CHECK-LABEL: @test_vcvtq_s64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fptosi <2 x double> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %a) // CHECK: ret <2 x i64> [[TMP1]] int64x2_t test_vcvtq_s64_f64(float64x2_t a) { return vcvtq_s64_f64(a); @@ -2335,7 +2335,7 @@ int64x2_t test_vcvtq_s64_f64(float64x2_t a) { // CHECK-LABEL: @test_vcvt_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fptoui <2 x float> %a to <2 x i32> +// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %a) // CHECK: ret <2 x i32> [[TMP1]] uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); @@ -2343,7 +2343,7 @@ uint32x2_t test_vcvt_u32_f32(float32x2_t a) { // CHECK-LABEL: @test_vcvtq_u32_f32( // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fptoui <4 x float> %a to <4 x i32> +// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %a) // CHECK: ret <4 x i32> [[TMP1]] uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); @@ -2351,7 +2351,7 @@ uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { // CHECK-LABEL: @test_vcvtq_u64_f64( // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fptoui <2 x double> %a to <2 x i64> +// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %a) // CHECK: ret <2 x i64> [[TMP1]] uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { return vcvtq_u64_f64(a); diff --git a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c index 45b18453b89dd..32161146ef450 100644 --- a/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c @@ -97,42 +97,44 @@ float16_t test_vcvth_f16_u64 (uint64_t a) { } // CHECK-LABEL: test_vcvth_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi half %a to i16 -// CHECK: ret i16 [[VCVT]] +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a) +// CHECK: [[TRUNC:%.*]] = trunc i32 [[VCVT]] to i16 +// CHECK: ret i16 [[TRUNC]] int16_t test_vcvth_s16_f16 (float16_t a) { 
return vcvth_s16_f16(a); } // CHECK-LABEL: test_vcvth_s32_f16 -// CHECK: [[VCVT:%.*]] = fptosi half %a to i32 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a) // CHECK: ret i32 [[VCVT]] int32_t test_vcvth_s32_f16 (float16_t a) { return vcvth_s32_f16(a); } // CHECK-LABEL: test_vcvth_s64_f16 -// CHECK: [[VCVT:%.*]] = fptosi half %a to i64 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a) // CHECK: ret i64 [[VCVT]] int64_t test_vcvth_s64_f16 (float16_t a) { return vcvth_s64_f16(a); } // CHECK-LABEL: test_vcvth_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui half %a to i16 -// CHECK: ret i16 [[VCVT]] +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a) +// CHECK: [[TRUNC:%.*]] = trunc i32 [[VCVT]] to i16 +// CHECK: ret i16 [[TRUNC]] uint16_t test_vcvth_u16_f16 (float16_t a) { return vcvth_u16_f16(a); } // CHECK-LABEL: test_vcvth_u32_f16 -// CHECK: [[VCVT:%.*]] = fptoui half %a to i32 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a) // CHECK: ret i32 [[VCVT]] uint32_t test_vcvth_u32_f16 (float16_t a) { return vcvth_u32_f16(a); } // CHECK-LABEL: test_vcvth_u64_f16 -// CHECK: [[VCVT:%.*]] = fptoui half %a to i64 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a) // CHECK: ret i64 [[VCVT]] uint64_t test_vcvth_u64_f16 (float16_t a) { return vcvth_u64_f16(a); diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c index 59b3dfec80cb9..5c4f9053a9aec 100644 --- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -130,28 +130,28 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) { } // CHECK-LABEL: test_vcvt_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16> +// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a) // CHECK: ret <4 x i16> [[VCVT]] int16x4_t test_vcvt_s16_f16 (float16x4_t a) { return vcvt_s16_f16(a); } // CHECK-LABEL: test_vcvtq_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16> +// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> %a) // CHECK: ret <8 x i16> [[VCVT]] int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { return vcvtq_s16_f16(a); } // CHECK-LABEL: test_vcvt_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16> +// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a) // CHECK: ret <4 x i16> [[VCVT]] uint16x4_t test_vcvt_u16_f16 (float16x4_t a) { return vcvt_u16_f16(a); } // CHECK-LABEL: test_vcvtq_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16> +// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %a) // CHECK: ret <8 x i16> [[VCVT]] uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) { return vcvtq_u16_f16(a); diff --git a/clang/test/CodeGen/arm-bf16-softfloat.c b/clang/test/CodeGen/arm-bf16-softfloat.c index 5ea7319ec50ba..1792a1e3c9ceb 100644 --- a/clang/test/CodeGen/arm-bf16-softfloat.c +++ b/clang/test/CodeGen/arm-bf16-softfloat.c @@ -1,4 +1,10 @@ -// RUN: not %clang -o %t.out -target arm-arm-eabi -march=armv8-a+bf16 -mfloat-abi=soft -c %s 2>&1 | FileCheck %s +// REQUIRES: arm-registered-target +// RUN: not %clang -target arm-arm-none-eabi -march=armv8-a+bf16 -mfloat-abi=soft -c %s -o %t 2>&1 | FileCheck %s +// RUN: not %clang -target arm-arm-none-eabi -march=armv8-a+bf16 -mfpu=none -c %s -o %t 2>&1 | FileCheck %s +// RUN: 
not %clang -target arm-arm-none-eabi -march=armv8-a+bf16+nofp -c %s -o %t 2>&1 | FileCheck %s +// RUN: not %clang -target arm-arm-none-eabi -march=armv8-a+bf16+fp+nofp -c %s -o %t 2>&1 | FileCheck %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8-a+bf16+fp -c %s -o %t +// RUN: %clang -target arm-arm-none-eabi -march=armv8-a+bf16+nofp+fp -c %s -o %t // CHECK: error: __bf16 is not supported on this target extern __bf16 var; diff --git a/clang/test/CodeGen/builtins-ppc-error.c b/clang/test/CodeGen/builtins-ppc-error.c index 80ca227eeb24c..6557565938b0c 100644 --- a/clang/test/CodeGen/builtins-ppc-error.c +++ b/clang/test/CodeGen/builtins-ppc-error.c @@ -32,16 +32,16 @@ void testInsertWord(void) { } void testXXPERMDI(int index) { - vec_xxpermdi(vsi); //expected-error {{too few arguments to function call, expected at least 3, have 1}} - vec_xxpermdi(vsi, vsi, 2, 4); //expected-error {{too many arguments to function call, expected at most 3, have 4}} + vec_xxpermdi(vsi); //expected-error {{too few arguments to function call, expected 3, have 1}} + vec_xxpermdi(vsi, vsi, 2, 4); //expected-error {{too many arguments to function call, expected 3, have 4}} vec_xxpermdi(vsi, vsi, index); //expected-error {{argument 3 to '__builtin_vsx_xxpermdi' must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)}} vec_xxpermdi(1, 2, 3); //expected-error {{first two arguments to '__builtin_vsx_xxpermdi' must be vectors}} vec_xxpermdi(vsi, vuc, 2); //expected-error {{first two arguments to '__builtin_vsx_xxpermdi' must have the same type}} } void testXXSLDWI(int index) { - vec_xxsldwi(vsi); //expected-error {{too few arguments to function call, expected at least 3, have 1}} - vec_xxsldwi(vsi, vsi, 2, 4); //expected-error {{too many arguments to function call, expected at most 3, have 4}} + vec_xxsldwi(vsi); //expected-error {{too few arguments to function call, expected 3, have 1}} + vec_xxsldwi(vsi, vsi, 2, 4); //expected-error {{too many arguments to function call, expected 3, have 4}} vec_xxsldwi(vsi, vsi, index); //expected-error {{argument 3 to '__builtin_vsx_xxsldwi' must be a 2-bit unsigned literal (i.e. 
0, 1, 2 or 3)}} vec_xxsldwi(1, 2, 3); //expected-error {{first two arguments to '__builtin_vsx_xxsldwi' must be vectors}} vec_xxsldwi(vsi, vuc, 2); //expected-error {{first two arguments to '__builtin_vsx_xxsldwi' must have the same type}} diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c index 6f38ac77ee242..571d33d34a220 100644 --- a/clang/test/CodeGen/builtins-ppc-p10vector.c +++ b/clang/test/CodeGen/builtins-ppc-p10vector.c @@ -1,15 +1,11 @@ // REQUIRES: powerpc-registered-target -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ -// RUN: -target-cpu pwr10 -triple powerpc64le-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s - -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ +// RUN: %clang_cc1 -target-feature +vsx \ // RUN: -target-cpu pwr10 -triple powerpc64-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s -check-prefix=CHECK-BE +// RUN: -o - | FileCheck %s -check-prefixes=CHECK-BE,CHECK -// RUN: %clang_cc1 -target-feature +vsx -target-feature +altivec \ +// RUN: %clang_cc1 -target-feature +vsx \ // RUN: -target-cpu pwr10 -triple powerpc64le-unknown-unknown -emit-llvm %s \ -// RUN: -o - | FileCheck %s -check-prefix=CHECK-LE +// RUN: -o - | FileCheck %s -check-prefixes=CHECK-LE,CHECK #include @@ -29,6 +25,66 @@ unsigned char uca; unsigned short usa; unsigned long long ulla; +vector signed long long test_vec_mul_sll(void) { + // CHECK: mul <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_mul(vslla, vsllb); +} + +vector unsigned long long test_vec_mul_ull(void) { + // CHECK: mul <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_mul(vulla, vullb); +} + +vector signed int test_vec_div_si(void) { + // CHECK: sdiv <4 x i32> + // CHECK-NEXT: ret <4 x i32> + return vec_div(vsia, vsib); +} + +vector unsigned int test_vec_div_ui(void) { + // CHECK: udiv <4 x i32> + // CHECK-NEXT: ret <4 x i32> + return vec_div(vuia, vuib); +} + +vector signed long long test_vec_div_sll(void) { + // CHECK: sdiv <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_div(vslla, vsllb); +} + +vector unsigned long long test_vec_div_ull(void) { + // CHECK: udiv <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_div(vulla, vullb); +} + +vector signed int test_vec_mod_si(void) { + // CHECK: srem <4 x i32> + // CHECK-NEXT: ret <4 x i32> + return vec_mod(vsia, vsib); +} + +vector unsigned int test_vec_mod_ui(void) { + // CHECK: urem <4 x i32> + // CHECK-NEXT: ret <4 x i32> + return vec_mod(vuia, vuib); +} + +vector signed long long test_vec_mod_sll(void) { + // CHECK: srem <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_mod(vslla, vsllb); +} + +vector unsigned long long test_vec_mod_ull(void) { + // CHECK: urem <2 x i64> + // CHECK-NEXT: ret <2 x i64> + return vec_mod(vulla, vullb); +} + vector unsigned long long test_vpdepd(void) { // CHECK: @llvm.ppc.altivec.vpdepd(<2 x i64> // CHECK-NEXT: ret <2 x i64> @@ -514,19 +570,16 @@ vector unsigned int test_vec_inserth_uiv(void) { } vector signed int test_vec_vec_splati_si(void) { - // CHECK-BE: ret <4 x i32> // CHECK: ret <4 x i32> return vec_splati(-17); } vector unsigned int test_vec_vec_splati_ui(void) { - // CHECK-BE: ret <4 x i32> // CHECK: ret <4 x i32> return vec_splati(16U); } vector float test_vec_vec_splati_f(void) { - // CHECK-BE: ret <4 x float> // CHECK: ret <4 x float> return vec_splati(1.0f); } @@ -536,10 +589,10 @@ vector double test_vec_vec_splatid(void) { // CHECK-BE-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double 
[[T1:%.+]], i32 0 // CHECK-BE-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize // CHECK-BE-NEXT: ret <2 x double> [[T3:%.+]] - // CHECK: [[T1:%.+]] = fpext float %{{.+}} to double - // CHECK-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double [[T1:%.+]], i32 0 - // CHECK-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize - // CHECK-NEXT: ret <2 x double> [[T3:%.+]] + // CHECK-LE: [[T1:%.+]] = fpext float %{{.+}} to double + // CHECK-LE-NEXT: [[T2:%.+]] = insertelement <2 x double> undef, double [[T1:%.+]], i32 0 + // CHECK-LE-NEXT: [[T3:%.+]] = shufflevector <2 x double> [[T2:%.+]], <2 x double> undef, <2 x i32> zeroinitialize + // CHECK-LE-NEXT: ret <2 x double> [[T3:%.+]] return vec_splatid(1.0); } @@ -548,11 +601,11 @@ vector signed int test_vec_vec_splati_ins_si(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x i32> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x i32> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x i32> return vec_splati_ins(vsia, 0, -17); } @@ -561,11 +614,11 @@ vector unsigned int test_vec_vec_splati_ins_ui(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x i32> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x i32> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x i32> %{{.+}}, i32 %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x i32> return vec_splati_ins(vuia, 1, 16U); } @@ -574,14 +627,62 @@ vector float test_vec_vec_splati_ins_f(void) { // CHECK-BE: [[T1:%.+]] = add i32 2, %{{.+}} // CHECK-BE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] // CHECK-BE: ret <4 x float> - // CHECK: [[T1:%.+]] = sub i32 1, %{{.+}} - // CHECK: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] - // CHECK: [[T2:%.+]] = sub i32 3, %{{.+}} - // CHECK: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T2]] - // CHECK: ret <4 x float> + // CHECK-LE: [[T1:%.+]] = sub i32 1, %{{.+}} + // CHECK-LE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T1]] + // CHECK-LE: [[T2:%.+]] = sub i32 3, %{{.+}} + // CHECK-LE: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 [[T2]] + // CHECK-LE: ret <4 x float> return vec_splati_ins(vfa, 0, 1.0f); } +void test_vec_xst_trunc_sc(vector signed __int128 __a, signed long long __b, + signed char *__c) { + // CHECK: store i8 %{{.+}}, i8* %{{.+}}, align 1 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_uc(vector unsigned __int128 __a, signed long long __b, + unsigned char *__c) { + // CHECK: store i8 %{{.+}}, i8* %{{.+}}, align 1 + vec_xst_trunc(__a, __b, __c); +} + 
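// A scalar model of the truncating-store semantics these vec_xst_trunc tests
// pin down: truncate the single 128-bit element to the target width and store
// it at a byte offset from the pointer. A sketch of the assumed semantics
// only; the intrinsic lowers to a single Power10 store, as the `store iN`
// CHECK lines verify.
#include <string.h>

static void xst_trunc_model_si(__int128 a, long long b, int *c) {
  int low = (int)a;               // keep the low 32 bits of the element
  memcpy((char *)c + b, &low, 4); // store at byte offset b
}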
+void test_vec_xst_trunc_ss(vector signed __int128 __a, signed long long __b, + signed short *__c) { + // CHECK: store i16 %{{.+}}, i16* %{{.+}}, align 2 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_us(vector unsigned __int128 __a, signed long long __b, + unsigned short *__c) { + // CHECK: store i16 %{{.+}}, i16* %{{.+}}, align 2 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_si(vector signed __int128 __a, signed long long __b, + signed int *__c) { + // CHECK: store i32 %{{.+}}, i32* %{{.+}}, align 4 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_ui(vector unsigned __int128 __a, signed long long __b, + unsigned int *__c) { + // CHECK: store i32 %{{.+}}, i32* %{{.+}}, align 4 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_sll(vector signed __int128 __a, signed long long __b, + signed long long *__c) { + // CHECK: store i64 %{{.+}}, i64* %{{.+}}, align 8 + vec_xst_trunc(__a, __b, __c); +} + +void test_vec_xst_trunc_ull(vector unsigned __int128 __a, signed long long __b, + unsigned long long *__c) { + // CHECK: store i64 %{{.+}}, i64* %{{.+}}, align 8 + vec_xst_trunc(__a, __b, __c); +} + int test_vec_test_lsbb_all_ones(void) { // CHECK: @llvm.ppc.vsx.xvtlsbb(<16 x i8> %{{.+}}, i32 1 // CHECK-NEXT: ret i32 diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index f7e3dc1ea5e7a..14e0d0ac65ed2 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -737,51 +737,15 @@ i16x8 narrow_u_i16x8_i32x4(i32x4 low, i32x4 high) { // WEBASSEMBLY: ret } -i16x8 widen_low_s_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_low_s_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8> %v) +i32x4 load32_zero(int *p) { + return __builtin_wasm_load32_zero(p); + // WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p) // WEBASSEMBLY: ret } -i16x8 widen_high_s_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_high_s_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i16x8 widen_low_u_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_low_u_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i16x8 widen_high_u_i16x8_i8x16(i8x16 v) { - return __builtin_wasm_widen_high_u_i16x8_i8x16(v); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_low_s_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_low_s_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_high_s_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_high_s_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_low_u_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_low_u_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16> %v) - // WEBASSEMBLY: ret -} - -i32x4 widen_high_u_i32x4_i16x8(i16x8 v) { - return __builtin_wasm_widen_high_u_i32x4_i16x8(v); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16> %v) +i64x2 load64_zero(long long *p) { + return __builtin_wasm_load64_zero(p); + // WEBASSEMBLY: call <2 x i64> @llvm.wasm.load64.zero(i64* %p) // WEBASSEMBLY: ret } diff --git 
a/clang/test/CodeGen/callback_annotated.c b/clang/test/CodeGen/callback_annotated.c index c5b431d5ef845..83a79c3491daf 100644 --- a/clang/test/CodeGen/callback_annotated.c +++ b/clang/test/CodeGen/callback_annotated.c @@ -1,6 +1,4 @@ -// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -O1 -fno-experimental-new-pass-manager %s -emit-llvm -o - | FileCheck %s --check-prefix=RUN1 -// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -O1 -fno-experimental-new-pass-manager %s -emit-llvm -o - | FileCheck %s --check-prefix=RUN2 -// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -O1 -fno-experimental-new-pass-manager %s -emit-llvm -o - | opt -ipconstprop -S | FileCheck --check-prefix=IPCP %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -fno-experimental-new-pass-manager %s -emit-llvm -o - -disable-llvm-optzns | FileCheck %s --check-prefix=RUN1 // RUN1-DAG: @broker0({{[^#]*#[0-9]+}} !callback ![[cid0:[0-9]+]] __attribute__((callback(1, 2))) void *broker0(void *(*callee)(void *), void *payload) { @@ -29,22 +27,10 @@ __attribute__((callback(4, -1, a, __))) void *broker4(int a, int, int, int (*cal __attribute__((callback(4, d, 5, 2))) void *broker5(int, int, int, int (*callee)(int, int, int), int d); static void *VoidPtr2VoidPtr(void *payload) { - // RUN2: ret i8* %payload - // IPCP: ret i8* null return payload; } static int ThreeInt2Int(int a, int b, int c) { - // RUN2: define internal i32 @ThreeInt2Int(i32 %a, i32 %b, i32 %c) - // RUN2: %mul = mul nsw i32 %b, %a - // RUN2: %add = add nsw i32 %mul, %c - // RUN2: ret i32 %add - - // IPCP: define internal i32 @ThreeInt2Int(i32 %a, i32 %b, i32 %c) - // IPCP: %mul = mul nsw i32 4, %a - // IPCP: %add = add nsw i32 %mul, %c - // IPCP: ret i32 %add - return a * b + c; } diff --git a/clang/test/CodeGen/callback_openmp.c b/clang/test/CodeGen/callback_openmp.c index 2fc9dcd391f63..90e63fdb2e580 100644 --- a/clang/test/CodeGen/callback_openmp.c +++ b/clang/test/CodeGen/callback_openmp.c @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -O1 %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp -O1 %s -emit-llvm -o - | opt -ipconstprop -S | FileCheck --check-prefix=IPCP %s +// RUN: %clang_cc1 -triple i386-unknown-unknown -fopenmp %s -emit-llvm -o - -disable-llvm-optzns | FileCheck %s // CHECK: declare !callback ![[cid:[0-9]+]] void @__kmpc_fork_call // CHECK: declare !callback ![[cid]] void @__kmpc_fork_teams @@ -15,14 +14,11 @@ void foo(int q) { #pragma omp parallel firstprivate(q, p) work1(p, q); -// IPCP: call void @work1(i32 2, i32 %{{[._a-zA-Z0-9]*}}) #pragma omp parallel for firstprivate(p, q) for (int i = 0; i < q; i++) work2(i, p); -// IPCP: call void @work2(i32 %{{[._a-zA-Z0-9]*}}, i32 2) #pragma omp target teams firstprivate(p) work12(p, p); -// IPCP: call void @work12(i32 2, i32 2) } diff --git a/clang/test/CodeGen/callback_pthread_create.c b/clang/test/CodeGen/callback_pthread_create.c index 785440030b32e..d1b01b91eac3f 100644 --- a/clang/test/CodeGen/callback_pthread_create.c +++ b/clang/test/CodeGen/callback_pthread_create.c @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -O1 %s -S -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -O1 %s -S -emit-llvm -o - | opt -ipconstprop -S | FileCheck --check-prefix=IPCP %s +// RUN: %clang_cc1 %s -S -emit-llvm -o - -disable-llvm-optzns | FileCheck %s // CHECK: declare !callback ![[cid:[0-9]+]] {{.*}}i32 @pthread_create // CHECK: ![[cid]] = !{![[cidb:[0-9]+]]} @@ -21,14 +20,10 @@ int pthread_create(pthread_t *, const pthread_attr_t 
*, const int GlobalVar = 0; static void *callee0(void *payload) { -// IPCP: define internal i8* @callee0 -// IPCP: ret i8* null return payload; } static void *callee1(void *payload) { -// IPCP: define internal i8* @callee1 -// IPCP: ret i8* bitcast (i32* @GlobalVar to i8*) return payload; } diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll index 6269fd677ca82..9f9a8bec4ef5d 100644 --- a/clang/test/CodeGen/thinlto-distributed-newpm.ll +++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll @@ -1,3 +1,4 @@ +; FIXME: This test should use CHECK-NEXT to keep up-to-date. ; REQUIRES: x86-registered-target ; Validate ThinLTO post link pipeline at O2 and O3 @@ -18,23 +19,18 @@ ; RUN: -c -fthinlto-index=%t.o.thinlto.bc \ ; RUN: -o %t.native.o -x ir %t.o 2>&1 | FileCheck -check-prefixes=CHECK-O,CHECK-O3 %s --dump-input=fail -; CHECK-O: Running analysis: PassInstrumentationAnalysis ; CHECK-O: Starting {{.*}}Module pass manager run. ; CHECK-O: Running pass: WholeProgramDevirtPass ; CHECK-O: Running analysis: InnerAnalysisManagerProxy ; CHECK-O: Running pass: LowerTypeTestsPass -; CHECK-O: Invalidating all non-preserved analyses for: ; CHECK-O: Invalidating analysis: InnerAnalysisManagerProxy ; CHECK-O: Running pass: ForceFunctionAttrsPass -; CHECK-O: Running pass: PassManager<{{.*}}Module> ; CHECK-O: Starting {{.*}}Module pass manager run. ; CHECK-O: Running pass: PGOIndirectCallPromotion ; CHECK-O: Running analysis: ProfileSummaryAnalysis ; CHECK-O: Running analysis: InnerAnalysisManagerProxy ; CHECK-O: Running analysis: OptimizationRemarkEmitterAnalysis on main -; CHECK-O: Running analysis: PassInstrumentationAnalysis on main ; CHECK-O: Running pass: InferFunctionAttrsPass -; CHECK-O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager<{{.*}}Function>{{ ?}}> ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O: Running analysis: TargetIRAnalysis on main @@ -46,18 +42,16 @@ ; CHECK-O: Running pass: LowerExpectIntrinsicPass on main ; CHECK-O3: Running pass: CallSiteSplittingPass on main ; CHECK-O: Finished {{.*}}Function pass manager run. +; CHECK-O: Running pass: LowerTypeTestsPass ; CHECK-O: Running pass: IPSCCPPass ; CHECK-O: Running pass: CalledValuePropagationPass ; CHECK-O: Running pass: GlobalOptPass -; CHECK-O: Invalidating all non-preserved analyses for: ; CHECK-O: Invalidating analysis: InnerAnalysisManagerProxy -; CHECK-O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PromotePass> ; CHECK-O: Running analysis: InnerAnalysisManagerProxy +; CHECK-O: Running pass: PromotePass ; CHECK-O: Running analysis: DominatorTreeAnalysis on main -; CHECK-O: Running analysis: PassInstrumentationAnalysis on main ; CHECK-O: Running analysis: AssumptionAnalysis on main ; CHECK-O: Running pass: DeadArgumentEliminationPass -; CHECK-O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager<{{.*}}Function>{{ ?}}> ; CHECK-O: Starting {{.*}}Function pass manager run. 
; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running analysis: TargetLibraryAnalysis on main @@ -74,21 +68,16 @@ ; CHECK-O: Running analysis: GlobalsAA ; CHECK-O: Running analysis: CallGraphAnalysis ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis -; CHECK-O: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}DevirtSCCRepeatedPass<{{.*}}PassManager<{{.*}}LazyCallGraph::SCC ; CHECK-O: Running analysis: InnerAnalysisManagerProxy ; CHECK-O: Running analysis: LazyCallGraphAnalysis ; CHECK-O: Running analysis: FunctionAnalysisManagerCGSCCProxy on (main) -; CHECK-O: Running analysis: PassInstrumentationAnalysis on (main) ; CHECK-O: Running analysis: OuterAnalysisManagerProxy ; CHECK-O: Starting CGSCC pass manager run. ; CHECK-O: Running pass: InlinerPass on (main) ; CHECK-O: Running pass: PostOrderFunctionAttrsPass on (main) -; CHECK-O: Invalidating all non-preserved analyses for: (main) ; CHECK-O: Clearing all analysis results for: main ; CHECK-O3: Running pass: ArgumentPromotionPass on (main) ; CHECK-O3: Running analysis: TargetIRAnalysis on main -; CHECK-O: Running analysis: PassInstrumentationAnalysis on main -; CHECK-O3: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: SROA on main ; These next two can appear in any order since they are accessed as parameters @@ -108,6 +97,7 @@ ; CHECK-O: Running pass: JumpThreadingPass on main ; CHECK-O: Running analysis: LazyValueAnalysis on main ; CHECK-O: Running pass: CorrelatedValuePropagationPass on main +; CHECK-O: Invalidating analysis: LazyValueAnalysis on main ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O3: Running pass: AggressiveInstCombinePass on main ; CHECK-O: Running pass: InstCombinePass on main @@ -117,7 +107,6 @@ ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O: Running pass: ReassociatePass on main ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis -; CHECK-O: Running pass: FunctionToLoopPassAdaptor<{{.*}}PassManager<{{.*}}Loop ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running analysis: LoopAnalysis on main @@ -125,7 +114,6 @@ ; CHECK-O: Finished {{.*}}Function pass manager run. ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O: Running pass: InstCombinePass on main -; CHECK-O: Running pass: FunctionToLoopPassAdaptor<{{.*}}PassManager<{{.*}}Loop ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running pass: LCSSAPass on main @@ -143,7 +131,6 @@ ; CHECK-O: Running pass: JumpThreadingPass on main ; CHECK-O: Running pass: CorrelatedValuePropagationPass on main ; CHECK-O: Running pass: DSEPass on main -; CHECK-O: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass> on main ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running pass: LCSSAPass on main @@ -154,13 +141,10 @@ ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Finished {{.*}}Function pass manager run. ; CHECK-O: Finished CGSCC pass manager run. 
-; CHECK-O: Invalidating all non-preserved analyses for: -; CHECK-O: Invalidating all non-preserved analyses for: main ; CHECK-O: Invalidating analysis: DominatorTreeAnalysis on main ; CHECK-O: Invalidating analysis: BasicAA on main ; CHECK-O: Invalidating analysis: AAManager on main ; CHECK-O: Invalidating analysis: MemorySSAAnalysis on main -; CHECK-O: Invalidating analysis: LazyValueAnalysis on main ; CHECK-O: Invalidating analysis: LoopAnalysis on main ; CHECK-O: Invalidating analysis: PhiValuesAnalysis on main ; CHECK-O: Invalidating analysis: MemoryDependenceAnalysis on main @@ -168,7 +152,6 @@ ; CHECK-O: Invalidating analysis: PostDominatorTreeAnalysis on main ; CHECK-O: Invalidating analysis: CallGraphAnalysis ; CHECK-O: Finished {{.*}}Module pass manager run. -; CHECK-O: Running pass: PassManager<{{.*}}Module> ; CHECK-O: Starting {{.*}}Module pass manager run. ; CHECK-O: Running pass: GlobalOptPass ; CHECK-O: Running pass: GlobalDCEPass @@ -176,40 +159,43 @@ ; CHECK-O: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O: Running analysis: CallGraphAnalysis ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA -; CHECK-O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager<{{.*}}Function>{{ ?}}> ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: Float2IntPass on main ; CHECK-O: Running pass: LowerConstantIntrinsicsPass on main -; CHECK-O: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass> on main ; CHECK-O: Starting {{.*}}Function pass manager run. ; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running analysis: LoopAnalysis on main ; CHECK-O: Running pass: LCSSAPass on main ; CHECK-O: Finished {{.*}}Function pass manager run. -; CHECK-O: Running pass: LoopDistributePass on main -; CHECK-O: Running analysis: ScalarEvolutionAnalysis on main +; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running analysis: AAManager on main ; CHECK-O: Running analysis: BasicAA on main +; CHECK-O: Running analysis: ScalarEvolutionAnalysis on main ; CHECK-O: Running analysis: InnerAnalysisManagerProxy +; CHECK-O: Running pass: LoopRotatePass on Loop at depth 1 containing: %b +; CHECK-O: Running pass: LoopDistributePass on main +; CHECK-O: Running pass: InjectTLIMappings on main ; CHECK-O: Running pass: LoopVectorizePass on main ; CHECK-O: Running analysis: BlockFrequencyAnalysis on main ; CHECK-O: Running analysis: BranchProbabilityAnalysis on main +; CHECK-O: Running analysis: PostDominatorTreeAnalysis on main ; CHECK-O: Running analysis: DemandedBitsAnalysis on main -; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running pass: LoopLoadEliminationPass on main +; CHECK-O: Running analysis: LoopAccessAnalysis on Loop at depth 1 containing: %b ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O: Running pass: SLPVectorizerPass on main +; CHECK-O: Running pass: VectorCombinePass on main ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running pass: LoopUnrollPass on main ; CHECK-O: Running pass: WarnMissedTransformationsPass on main ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis -; CHECK-O: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass> on main ; CHECK-O: Starting {{.*}}Function pass manager run. 
; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running pass: LCSSAPass on main ; CHECK-O: Finished {{.*}}Function pass manager run. +; CHECK-O: Running pass: LICMPass on Loop at depth 1 containing: %b ; CHECK-O: Running pass: AlignmentFromAssumptionsPass on main ; CHECK-O: Running pass: LoopSinkPass on main ; CHECK-O: Running pass: InstSimplifyPass on main @@ -227,6 +213,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 target triple = "x86_64-grtev4-linux-gnu" define i32 @main() { -entry: + br label %b +b: + br label %b ret i32 0 } diff --git a/clang/test/CodeGen/ve-abi.c b/clang/test/CodeGen/ve-abi.c index aa35095d5dea0..1c230cb616a38 100644 --- a/clang/test/CodeGen/ve-abi.c +++ b/clang/test/CodeGen/ve-abi.c @@ -1,14 +1,96 @@ +/// Check that ABI is correctly implemented. +/// +/// 1. Check that all integer arguments and return values less than 64 bits +/// are sign/zero extended. +/// 2. Check that all complex arguments and return values are placed in +/// registers if it is possible. Not treat it as aggregate. +/// 3. Check that a function declared without argument type declarations is +/// treated as VARARGS (in order to place arguments in both registers and +/// memory locations in the back end) + // RUN: %clang_cc1 -triple ve-linux-gnu -emit-llvm %s -o - | FileCheck %s -// CHECK-LABEL: define { float, float } @p(float %a.coerce0, float %a.coerce1, float %b.coerce0, float %b.coerce1) #0 { -float __complex__ p(float __complex__ a, float __complex__ b) { +// CHECK-LABEL: define signext i8 @fun_si8(i8 signext %a, i8 signext %b) #0 { +char fun_si8(char a, char b) { + return a; +} + +// CHECK-LABEL: define zeroext i8 @fun_zi8(i8 zeroext %a, i8 zeroext %b) #0 { +unsigned char fun_zi8(unsigned char a, unsigned char b) { + return a; +} + +// CHECK-LABEL: define signext i16 @fun_si16(i16 signext %a, i16 signext %b) #0 { +short fun_si16(short a, short b) { + return a; +} + +// CHECK-LABEL: define zeroext i16 @fun_zi16(i16 zeroext %a, i16 zeroext %b) #0 { +unsigned short fun_zi16(unsigned short a, unsigned short b) { + return a; +} + +// CHECK-LABEL: define signext i32 @fun_si32(i32 signext %a, i32 signext %b) #0 { +int fun_si32(int a, int b) { + return a; +} + +// CHECK-LABEL: define zeroext i32 @fun_zi32(i32 zeroext %a, i32 zeroext %b) #0 { +unsigned int fun_zi32(unsigned int a, unsigned int b) { + return a; +} + +// CHECK-LABEL: define i64 @fun_si64(i64 %a, i64 %b) #0 { +long fun_si64(long a, long b) { + return a; +} + +// CHECK-LABEL: define i64 @fun_zi64(i64 %a, i64 %b) #0 { +unsigned long fun_zi64(unsigned long a, unsigned long b) { + return a; +} + +// CHECK-LABEL: define i128 @fun_si128(i128 %a, i128 %b) #0 { +__int128 fun_si128(__int128 a, __int128 b) { +} + +// CHECK-LABEL: define i128 @fun_zi128(i128 %a, i128 %b) #0 { +unsigned __int128 fun_zi128(unsigned __int128 a, unsigned __int128 b) { + return a; +} + +// CHECK-LABEL: define float @fun_float(float %a, float %b) #0 { +float fun_float(float a, float b) { + return a; +} + +// CHECK-LABEL: define double @fun_double(double %a, double %b) #0 { +double fun_double(double a, double b) { + return a; +} + +// CHECK-LABEL: define fp128 @fun_quad(fp128 %a, fp128 %b) #0 { +long double fun_quad(long double a, long double b) { + return a; +} + +// CHECK-LABEL: define { float, float } @fun_fcomplex(float %a.coerce0, float %a.coerce1, float %b.coerce0, float %b.coerce1) #0 { +float __complex__ fun_fcomplex(float __complex__ a, float __complex__ b) { + return a; +} + +// CHECK-LABEL: define { 
double, double } @fun_dcomplex(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) #0 { +double __complex__ fun_dcomplex(double __complex__ a, double __complex__ b) { + return a; } -// CHECK-LABEL: define { double, double } @q(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) #0 { -double __complex__ q(double __complex__ a, double __complex__ b) { +// CHECK-LABEL: define { fp128, fp128 } @fun_qcomplex(fp128 %a.coerce0, fp128 %a.coerce1, fp128 %b.coerce0, fp128 %b.coerce1) #0 { +long double __complex__ fun_qcomplex(long double __complex__ a, long double __complex__ b) { + return a; } +extern int hoge(); void func() { - // CHECK-LABEL: %call = call i32 (i32, i32, i32, i32, i32, i32, i32, ...) bitcast (i32 (...)* @hoge to i32 (i32, i32, i32, i32, i32, i32, i32, ...)*)(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7) + // CHECK: %call = call signext i32 (i32, i32, i32, i32, i32, i32, i32, ...) bitcast (i32 (...)* @hoge to i32 (i32, i32, i32, i32, i32, i32, i32, ...)*)(i32 signext 1, i32 signext 2, i32 signext 3, i32 signext 4, i32 signext 5, i32 signext 6, i32 signext 7) hoge(1, 2, 3, 4, 5, 6, 7); } diff --git a/clang/test/CodeGenCXX/Inputs/profile-remap.proftext b/clang/test/CodeGenCXX/Inputs/profile-remap.proftext index a1f90cfa6e9ed..bf57fc696c49e 100644 --- a/clang/test/CodeGenCXX/Inputs/profile-remap.proftext +++ b/clang/test/CodeGenCXX/Inputs/profile-remap.proftext @@ -1,6 +1,6 @@ :ir _ZN3Foo8functionENS_1XE -29667547796 +146835647075900052 2 10 90 diff --git a/clang/test/CodeGenCXX/Inputs/profile-remap_entry.proftext b/clang/test/CodeGenCXX/Inputs/profile-remap_entry.proftext index 65bc6ff46711b..89772cd7644f2 100644 --- a/clang/test/CodeGenCXX/Inputs/profile-remap_entry.proftext +++ b/clang/test/CodeGenCXX/Inputs/profile-remap_entry.proftext @@ -1,7 +1,7 @@ :ir :entry_first _ZN3Foo8functionENS_1XE -29667547796 +146835647075900052 2 100 90 diff --git a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp index 60c5970cb94f4..e689afa620924 100644 --- a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp +++ b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp @@ -1,5 +1,11 @@ -// RUN: %clang_cc1 -std=c++2a -emit-llvm %s -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ITANIUM -// RUN: %clang_cc1 -std=c++2a -emit-llvm %s -triple x86_64-windows -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MSABI +// RUN: %clang_cc1 -std=c++2a -emit-llvm %s -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ITANIUM,CHECK-64BIT +// RUN: %clang_cc1 -std=c++2a -emit-llvm %s -triple x86_64-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI64,CHECK-64BIT +// RUN: %clang_cc1 -std=c++2a -emit-llvm %s -triple i386-windows -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MSABI,CHECK-MSABI32,CHECK-32BIT + +// PR46908: ensure the IR passes the verifier with optimizations enabled. +// RUN: %clang_cc1 -std=c++2a -emit-llvm-only %s -triple x86_64-linux-gnu -O2 +// RUN: %clang_cc1 -std=c++2a -emit-llvm-only %s -triple x86_64-windows -O2 +// RUN: %clang_cc1 -std=c++2a -emit-llvm-only %s -triple i386-windows -O2 namespace std { using size_t = decltype(sizeof(0)); @@ -21,7 +27,8 @@ void delete_A(A *a) { delete a; } // Ensure that we call the destroying delete and not the destructor. 
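
As context for these checks: with C++20's destroying delete, a delete-expression whose selected operator delete takes std::destroying_delete_t calls that operator directly, and the compiler emits no implicit destructor call; the operator body itself is responsible for destruction. A minimal sketch of the pattern (assuming the usual std declarations; the test's own definitions may differ in detail):

namespace std {
  struct destroying_delete_t { explicit destroying_delete_t() = default; };
}

struct A {
  ~A();
  void operator delete(A *p, std::destroying_delete_t); // destroying form
};

void delete_A(A *a) {
  delete a; // lowers to one call to A::operator delete; no ~A() call is emitted
}

The checks that follow pin down exactly that: a single call to the destroying delete, with no destructor call around it.
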
// CHECK-NOT: call // CHECK-ITANIUM: call void @_ZN1AdlEPS_St19destroying_delete_t(%{{.*}}* %[[a]]) -// CHECK-MSABI: call void @"??3A@@SAXPEAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], i8 +// CHECK-MSABI64: call void @"??3A@@SAXPEAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], i8 +// CHECK-MSABI32: call void @"??3A@@SAXPAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], %{{.*}}* byval(%{{.*}}) align 4 %{{.*}}) // CHECK-NOT: call // CHECK: } @@ -39,7 +46,7 @@ void delete_B(B *b) { delete b; } // CHECK-NOT: call // CHECK: %[[VTABLE:.*]] = load // CHECK: %[[DTOR:.*]] = load -// CHECK: call {{void|i8\*}} %[[DTOR]](%{{.*}}* %[[b]] +// CHECK: call {{void|i8\*|x86_thiscallcc i8\*}} %[[DTOR]](%{{.*}}* %[[b]] // CHECK-MSABI-SAME: , i32 1) // CHECK-NOT: call // CHECK: } @@ -56,7 +63,8 @@ void delete_C(C *c) { delete c; } // CHECK: icmp eq %{{.*}} %[[c]], null // CHECK: br i1 // -// CHECK: %[[base:.*]] = getelementptr {{.*}}, i64 8 +// CHECK-64BIT: %[[base:.*]] = getelementptr {{.*}}, i64 8 +// CHECK-32BIT: %[[base:.*]] = getelementptr {{.*}}, i32 4 // CHECK: %[[castbase:.*]] = bitcast {{.*}} %[[base]] // // CHECK: %[[a:.*]] = phi {{.*}} %[[castbase]] @@ -65,7 +73,8 @@ void delete_C(C *c) { delete c; } // // CHECK-NOT: call // CHECK-ITANIUM: call void @_ZN1AdlEPS_St19destroying_delete_t(%{{.*}}* %[[a]]) -// CHECK-MSABI: call void @"??3A@@SAXPEAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], i8 +// CHECK-MSABI64: call void @"??3A@@SAXPEAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], i8 +// CHECK-MSABI32: call void @"??3A@@SAXPAU0@Udestroying_delete_t@std@@@Z"(%{{.*}}* %[[a]], %{{.*}}* byval(%{{.*}}) align 4 %{{.*}}) // CHECK-NOT: call // CHECK: } @@ -81,13 +90,14 @@ void delete_D(D *d) { delete d; } // For MS, we don't add a new vtable slot to the primary vtable for the virtual // destructor. Instead we cast to the VDel base class. 
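
Concretely, the hierarchy being exercised has roughly the shape sketched below (reconstructed from the comments and from the pointer adjustment in the following checks; the test's real classes may differ). Because VDel is not the primary base, the MS lowering first adjusts d to the VDel subobject — the getelementptr by 8 bytes on 64-bit, 4 bytes on 32-bit — and only then dispatches through VDel's vtable:

struct Primary { virtual void f(); };  // hypothetical primary base owning the first vptr
struct VDel {
  virtual ~VDel();
  void operator delete(VDel *p, std::destroying_delete_t);
};
struct D : Primary, VDel {};           // VDel sits one pointer-size past the start of D

void delete_D(D *d) {
  delete d; // MS ABI: adjust to the VDel subobject, then dispatch through its vtable
}
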
// CHECK-MSABI: bitcast {{.*}} %[[d]] -// CHECK-MSABI-NEXT: getelementptr {{.*}}, i64 8 +// CHECK-MSABI64-NEXT: getelementptr {{.*}}, i64 8 +// CHECK-MSABI32-NEXT: getelementptr {{.*}}, i32 4 // CHECK-MSABI-NEXT: %[[d:.*]] = bitcast i8* // // CHECK: %[[VTABLE:.*]] = load // CHECK: %[[DTOR:.*]] = load // -// CHECK: call {{void|i8\*}} %[[DTOR]](%{{.*}}* %[[d]] +// CHECK: call {{void|i8\*|x86_thiscallcc i8\*}} %[[DTOR]](%{{.*}}* %[[d]] // CHECK-MSABI-SAME: , i32 1) // CHECK-NOT: call // CHECK: } @@ -100,7 +110,8 @@ void delete_G(G *g) { delete g; } // CHECK-LABEL: define {{.*}}delete_G // CHECK-NOT: call // CHECK-ITANIUM: call void @_ZN1FdlEPS_St19destroying_delete_tmSt11align_val_t(%{{.*}}* %[[a]], i64 32, i64 16) -// CHECK-MSABI: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"(%{{.*}}* %[[a]], i8 {{[^,]*}}, i64 32, i64 16) +// CHECK-MSABI64: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"(%{{.*}}* %[[a]], i8 {{[^,]*}}, i64 32, i64 16) +// CHECK-MSABI32: call void @"??3F@@SAXPAU0@Udestroying_delete_t@std@@IW4align_val_t@2@@Z"(%{{.*}}* %[[a]], %{{.*}}* byval(%{{.*}}) align 4 %{{.*}}, i32 16, i32 16) // CHECK-NOT: call // CHECK: } @@ -116,19 +127,23 @@ H::~H() { call_in_dtor(); } // CHECK-ITANIUM-NOT: call // CHECK-ITANIUM: } -// CHECK-MSABI: define {{.*}} @"??_GH@@UEAAPEAXI@Z"( +// CHECK-MSABI64: define {{.*}} @"??_GH@@UEAAPEAXI@Z"( +// CHECK-MSABI32: define {{.*}} @"??_GH@@UAEPAXI@Z"( // CHECK-MSABI-NOT: call{{ }} // CHECK-MSABI: load i32 // CHECK-MSABI: icmp eq i32 {{.*}}, 0 // CHECK-MSABI: br i1 // // CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI: getelementptr {{.*}}, i64 24 +// CHECK-MSABI64: getelementptr {{.*}}, i64 24 +// CHECK-MSABI32: getelementptr {{.*}}, i32 20 // CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"({{.*}}, i64 48, i64 16) +// CHECK-MSABI64: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"({{.*}}, i64 48, i64 16) +// CHECK-MSABI32: call void @"??3F@@SAXPAU0@Udestroying_delete_t@std@@IW4align_val_t@2@@Z"({{.*}}, i32 32, i32 16) // CHECK-MSABI: br label %[[RETURN:.*]] // -// CHECK-MSABI: call void @"??1H@@UEAA@XZ"( +// CHECK-MSABI64: call void @"??1H@@UEAA@XZ"( +// CHECK-MSABI32: call x86_thiscallcc void @"??1H@@UAE@XZ"( // CHECK-MSABI: br label %[[RETURN]] // // CHECK-MSABI: } @@ -143,19 +158,23 @@ I::~I() { call_in_dtor(); } // CHECK-ITANIUM-NOT: call // CHECK-ITANIUM: } -// CHECK-MSABI: define {{.*}} @"??_GI@@UEAAPEAXI@Z"( +// CHECK-MSABI64: define {{.*}} @"??_GI@@UEAAPEAXI@Z"( +// CHECK-MSABI32: define {{.*}} @"??_GI@@UAEPAXI@Z"( // CHECK-MSABI-NOT: call{{ }} // CHECK-MSABI: load i32 // CHECK-MSABI: icmp eq i32 {{.*}}, 0 // CHECK-MSABI: br i1 // // CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI: getelementptr {{.*}}, i64 24 +// CHECK-MSABI64: getelementptr {{.*}}, i64 24 +// CHECK-MSABI32: getelementptr {{.*}}, i32 20 // CHECK-MSABI-NOT: call{{ }} -// CHECK-MSABI: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"({{.*}}, i64 96, i64 32) +// CHECK-MSABI64: call void @"??3F@@SAXPEAU0@Udestroying_delete_t@std@@_KW4align_val_t@2@@Z"({{.*}}, i64 96, i64 32) +// CHECK-MSABI32: call void @"??3F@@SAXPAU0@Udestroying_delete_t@std@@IW4align_val_t@2@@Z"({{.*}}, i32 64, i32 32) // CHECK-MSABI: br label %[[RETURN:.*]] // -// CHECK-MSABI: call void @"??1I@@UEAA@XZ"( +// CHECK-MSABI64: call void @"??1I@@UEAA@XZ"( +// CHECK-MSABI32: call x86_thiscallcc void @"??1I@@UAE@XZ"( // CHECK-MSABI: br label %[[RETURN]] // // 
CHECK-MSABI: } diff --git a/clang/test/CodeGenCXX/debug-info-limited-ctor.cpp b/clang/test/CodeGenCXX/debug-info-limited-ctor.cpp index 25b4ebdb54a32..89dd2b16b75b0 100644 --- a/clang/test/CodeGenCXX/debug-info-limited-ctor.cpp +++ b/clang/test/CodeGenCXX/debug-info-limited-ctor.cpp @@ -24,3 +24,10 @@ D::D() {} struct E { constexpr E(){}; } TestE; + +// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "F"{{.*}}DIFlagTypePassByValue +struct F { + F() = default; + F(int) {} + int i; +} TestF; diff --git a/clang/test/CodeGenCXX/fp16-mangle-arg-return.cpp b/clang/test/CodeGenCXX/fp16-mangle-arg-return.cpp new file mode 100644 index 0000000000000..15214e13ad8a7 --- /dev/null +++ b/clang/test/CodeGenCXX/fp16-mangle-arg-return.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -emit-llvm -o - -triple arm-arm-none-eabi -fallow-half-arguments-and-returns %s | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -o - -triple aarch64-arm-none-eabi -fallow-half-arguments-and-returns %s | FileCheck %s + +// Test name-mangling of __fp16 passed directly as a function argument +// (when that is permitted). + +// CHECK: define {{.*}}void @_Z13fp16_argumentDh(half %{{.*}}) +void fp16_argument(__fp16 arg) {} + +// Test name-mangling of __fp16 as a return type. The return type of +// fp16_return itself isn't mentioned in the mangled name, so to test +// this, we have to pass it a function pointer and make __fp16 the +// return type of that. + +// CHECK: define {{.*}}void @_Z11fp16_returnPFDhvE(half ()* %{{.*}}) +void fp16_return(__fp16 (*func)(void)) {} diff --git a/clang/test/CodeGenCXX/mangle-address-space.cpp b/clang/test/CodeGenCXX/mangle-address-space.cpp index 71bfb68b9a4ab..600e64ef7e673 100644 --- a/clang/test/CodeGenCXX/mangle-address-space.cpp +++ b/clang/test/CodeGenCXX/mangle-address-space.cpp @@ -66,11 +66,11 @@ __constant float *ocl_f1(char __generic const *p) { return 0;} // WINOCL-LABEL: define {{.*}}float* @"?ocl_f2@@YAPEAU?$_ASCLgeneric@$$CAM@__clang@@QEAU?$_ASCLgeneric@$$CBD@2@@Z" __generic float *ocl_f2(__generic char const * const p) { return 0;} -// CHECKOCL-LABEL: define {{.*}}void @_Z6ocl_f3PU6CLHost14ocl_OpaqueType -// WINOCL-LABEL: define {{.*}}void @"?ocl_f3@@YAXPEAU?$_ASCLHost@$$CAUocl_OpaqueType@@@__clang@@@Z" +// CHECKOCL-LABEL: define {{.*}}void @_Z6ocl_f3PU6CLhost14ocl_OpaqueType +// WINOCL-LABEL: define {{.*}}void @"?ocl_f3@@YAXPEAU?$_ASCLhost@$$CAUocl_OpaqueType@@@__clang@@@Z" void ocl_f3(ocl_OpaqueTypePtrH) {} -// CHECKOCL-LABEL: define {{.*}}void @_Z6ocl_f4PU8CLDevice14ocl_OpaqueType -// WINOCL-LABEL: define {{.*}}void @"?ocl_f4@@YAXPEAU?$_ASCLDevice@$$CAUocl_OpaqueType@@@__clang@@@Z" +// CHECKOCL-LABEL: define {{.*}}void @_Z6ocl_f4PU8CLdevice14ocl_OpaqueType +// WINOCL-LABEL: define {{.*}}void @"?ocl_f4@@YAXPEAU?$_ASCLdevice@$$CAUocl_OpaqueType@@@__clang@@@Z" void ocl_f4(ocl_OpaqueTypePtrD) {} #endif diff --git a/clang/test/CodeGenObjC/arc-unsafeclaim.m b/clang/test/CodeGenObjC/arc-unsafeclaim.m index a8011e024180d..40f1f164455a7 100644 --- a/clang/test/CodeGenObjC/arc-unsafeclaim.m +++ b/clang/test/CodeGenObjC/arc-unsafeclaim.m @@ -1,16 +1,16 @@ // Make sure it works on x86-64. 
-// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -fobjc-runtime=macosx-10.11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED +// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -fobjc-runtime=macosx-10.11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=NOTAIL-CALL // Make sure it works on x86-32. -// RUN: %clang_cc1 -triple i386-apple-darwin11 -fobjc-runtime=macosx-fragile-10.11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED +// RUN: %clang_cc1 -triple i386-apple-darwin11 -fobjc-runtime=macosx-fragile-10.11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED -check-prefix=CALL // Make sure it works on ARM. -// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED -// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED +// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED -check-prefix=CALL +// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED -check-prefix=CALL // Make sure it works on ARM64. -// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED -// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED +// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED -check-prefix=CALL +// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED -check-prefix=CALL // Make sure that it's implicitly disabled if the runtime version isn't high enough. 
// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-10.10 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=DISABLED @@ -29,7 +29,8 @@ void test_assign() { // CHECK: [[T0:%.*]] = call [[A:.*]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8* // CHECK-NEXT: store i8* [[T4]], i8** [[X]] @@ -53,7 +54,8 @@ void test_assign_assign() { // CHECK: [[T0:%.*]] = call [[A]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8* // CHECK-NEXT: store i8* [[T4]], i8** [[Y]] @@ -126,7 +128,8 @@ void test_init() { // CHECK: [[T0:%.*]] = call [[A]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8* // CHECK-NEXT: store i8* [[T4]], i8** [[X]] @@ -144,7 +147,8 @@ void test_init_assignment() { // CHECK: [[T0:%.*]] = call [[A]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8* // CHECK-NEXT: store i8* [[T4]], i8** [[X]] @@ -212,7 +216,8 @@ void test_ignored() { // CHECK: [[T0:%.*]] = call [[A]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: ret void @@ -223,7 +228,8 @@ void test_cast_to_void() { // CHECK: [[T0:%.*]] = call [[A]]* @makeA() // CHECK-MARKED-NEXT: call void asm sideeffect // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8* -// CHECK-NEXT: [[T2:%.*]] = call i8* 
@llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// NOTAIL-CALL-NEXT: [[T2:%.*]] = notail call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) +// CALL-NEXT: [[T2:%.*]] = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* [[T1]]) // CHECK-NEXT: bitcast i8* [[T2]] to [[A]]* // CHECK-NEXT: ret void diff --git a/clang/test/CodeGenOpenCL/address-spaces-conversions.cl b/clang/test/CodeGenOpenCL/address-spaces-conversions.cl index 52feccc540bb8..39cc86fef7509 100644 --- a/clang/test/CodeGenOpenCL/address-spaces-conversions.cl +++ b/clang/test/CodeGenOpenCL/address-spaces-conversions.cl @@ -6,7 +6,9 @@ // pointers to different address spaces // CHECK: define void @test -void test(global int *arg_glob, generic int *arg_gen) { +void test(global int *arg_glob, generic int *arg_gen, + __attribute__((opencl_global_device)) int *arg_device, + __attribute__((opencl_global_host)) int *arg_host) { int var_priv; arg_gen = arg_glob; // implicit cast global -> generic // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(4)* @@ -39,6 +41,30 @@ void test(global int *arg_glob, generic int *arg_gen) { // CHECK-NOT: bitcast // CHECK-NOFAKE: bitcast // CHECK-NOFAKE-NOT: addrspacecast + + arg_glob = arg_device; // implicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast + + arg_glob = arg_host; // implicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast + + arg_glob = (global int *)arg_device; // explicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast + + arg_glob = (global int *)arg_host; // explicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast + + arg_device = (__attribute((opencl_global_device)) int *)arg_glob; // explicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast + + arg_host = (__attribute((opencl_global_host)) int *)arg_glob; // explicit cast + // CHECK: addrspacecast + // CHECK-NOFAKE-NOT: addrspacecast } // Test ternary operator. diff --git a/clang/test/CodeGenOpenCL/address-spaces.cl b/clang/test/CodeGenOpenCL/address-spaces.cl index 144466a690b3f..ebd227a6a9c7d 100644 --- a/clang/test/CodeGenOpenCL/address-spaces.cl +++ b/clang/test/CodeGenOpenCL/address-spaces.cl @@ -51,6 +51,14 @@ void fl(local int *arg) {} // AMDGCN: i32 addrspace(4)* %arg void fc(constant int *arg) {} +// SPIR: i32 addrspace(5)* %arg +// AMDGCN: i32 addrspace(1)* %arg +void fd(__attribute__((opencl_global_device)) int *arg) {} + +// SPIR: i32 addrspace(6)* %arg +// AMDGCN: i32 addrspace(1)* %arg +void fh(__attribute__((opencl_global_host)) int *arg) {} + #ifdef CL20 int i; // CL20-DAG: @i = {{(dso_local )?}}addrspace(1) global i32 0 diff --git a/clang/test/CodeGenSYCL/union_kernel_param.cpp b/clang/test/CodeGenSYCL/union_kernel_param.cpp new file mode 100644 index 0000000000000..85a9e4f0c8c42 --- /dev/null +++ b/clang/test/CodeGenSYCL/union_kernel_param.cpp @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -I %S/Inputs -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -fsycl-int-header=%t.h %s -o %t.out +// RUN: FileCheck -input-file=%t.h %s + +// This test checks the integration header generated when +// the kernel argument is union. 
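
The { kernel_param_kind_t::kind_std_layout, 12, 12 } entry checked below evidently records the union's 12-byte size (and its offset inside the kernel object). The 12 follows from ordinary layout rules, which a standalone host-side sketch can confirm (plain C++, no SYCL required):

#include <cstdio>

union MyUnion {
  int FldInt;     // 4 bytes
  int FldArr[3];  // 12 bytes: a union adopts the size of its largest member
};

int main() {
  // Prints "12 4" on typical targets; the 4-byte alignment is what the
  // "byval({{.*}}) align 4" lines in the sibling test's IR checks reflect.
  std::printf("%zu %zu\n", sizeof(MyUnion), alignof(MyUnion));
  return 0;
}
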
+
+// CHECK: #include
+
+// CHECK: class MyKernel;
+
+// CHECK: __SYCL_INLINE_NAMESPACE(cl) {
+// CHECK-NEXT: namespace sycl {
+// CHECK-NEXT: namespace detail {
+
+// CHECK: static constexpr
+// CHECK-NEXT: const char* const kernel_names[] = {
+// CHECK-NEXT: "_ZTSZZ5test0vENK3$_0clERN2cl4sycl7handlerEE8MyKernel"
+// CHECK-NEXT: };
+
+// CHECK: static constexpr
+// CHECK-NEXT: const kernel_param_desc_t kernel_signatures[] = {
+// CHECK-NEXT: //--- _ZTSZZ5test0vENK3$_0clERN2cl4sycl7handlerEE8MyKernel
+// CHECK-NEXT: { kernel_param_kind_t::kind_accessor, 4062, 0 },
+// CHECK-NEXT: { kernel_param_kind_t::kind_std_layout, 12, 12 },
+// CHECK-EMPTY:
+// CHECK-NEXT: };
+
+// CHECK: static constexpr
+// CHECK-NEXT: const unsigned kernel_signature_start[] = {
+// CHECK-NEXT: 0 // _ZTSZZ5test0vENK3$_0clERN2cl4sycl7handlerEE8MyKernel
+// CHECK-NEXT: };
+
+// CHECK: template <> struct KernelInfo<class MyKernel> {
+
+#include "sycl.hpp"
+
+using namespace cl::sycl;
+
+union MyUnion {
+  int FldInt;
+  int FldArr[3];
+};
+
+MyUnion GlobS;
+
+bool test0() {
+  MyUnion S = GlobS;
+  MyUnion S0 = {0};
+  {
+    buffer<MyUnion, 1> Buf(&S0, range<1>(1));
+    queue myQueue;
+    myQueue.submit([&](handler &cgh) {
+      auto B = Buf.get_access(cgh);
+      cgh.single_task<class MyKernel>([=] { B; S; });
+    });
+  }
+}
+
diff --git a/clang/test/CodeGenSYCL/union_kernel_param1.cpp b/clang/test/CodeGenSYCL/union_kernel_param1.cpp
new file mode 100644
index 0000000000000..34b4120a4f04d
--- /dev/null
+++ b/clang/test/CodeGenSYCL/union_kernel_param1.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -fsycl -fsycl-is-device -I %S/Inputs -triple spir64-unknown-unknown-sycldevice -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s
+
+// This test checks that the compiler accepts a union as a kernel parameter.
+
+#include "sycl.hpp"
+
+using namespace cl::sycl;
+
+union MyUnion {
+  int FldInt;
+  int FldArr[3];
+};
+
+MyUnion GlobS;
+
+bool test0() {
+  MyUnion S = GlobS;
+  MyUnion S0 = {0};
+  {
+    buffer<MyUnion, 1> Buf(&S0, range<1>(1));
+    queue myQueue;
+    myQueue.submit([&](handler &cgh) {
+      auto B = Buf.get_access(cgh);
+      cgh.single_task<class MyKernel>([=] { B; S; });
+    });
+  }
+}
+
+// Check MyKernel parameters
+// CHECK: define spir_kernel void @{{.*}}MyKernel
+// CHECK-SAME: %union.{{.*}}.MyUnion addrspace(1)* [[MEM_ARG1:%[a-zA-Z0-9_]+]],
+// CHECK-SAME: %"struct.{{.*}}.cl::sycl::range"* byval({{.*}}) align 4 [[MEM_ARG2:%[a-zA-Z0-9_]+1]],
+// CHECK-SAME: %"struct.{{.*}}.cl::sycl::range"* byval({{.*}}) align 4 [[MEM_ARG3:%[a-zA-Z0-9_]+2]],
+// CHECK-SAME: %"struct.{{.*}}.cl::sycl::id"* byval({{.*}}) align 4 [[OFFSET1:%[a-zA-Z0-9_]+3]],
+// CHECK-SAME: %union.{{.*}}.MyUnion* byval({{.*}}) align 4 [[MEM_ARG4:%[a-zA-Z0-9_]+4]])
+
+// Check alloca for pointer arguments
+// CHECK: [[MEM_ARG1]].addr{{[0-9]*}} = alloca %union._ZTS7MyUnion.MyUnion addrspace(1)*, align 8
+
+// Check lambda object alloca
+// CHECK: [[LOCAL_OBJECT:%0]] = alloca %"class.{{.*}}.anon", align 4
+
+// Check allocas for ranges
+// CHECK: [[ACC_RANGE1:%[a-zA-Z0-9_.]+]] = alloca %"struct.{{.*}}.cl::sycl::range"
+// CHECK: [[ACC_RANGE2:%[a-zA-Z0-9_.]+]] = alloca %"struct.{{.*}}.cl::sycl::range"
+// CHECK: [[OFFSET2:%[a-zA-Z0-9_.]+]] = alloca %"struct.{{.*}}.cl::sycl::id"
+
+// CHECK: [[L_STRUCT_ADDR:%[a-zA-Z0-9_]+]] = getelementptr inbounds %"class.{{.*}}.anon", %"class.{{.*}}.anon"* [[LOCAL_OBJECT]], i32 0, i32 0
+// CHECK: [[ACC_CAST1:%[0-9]+]] = addrspacecast %"class{{.*}}accessor"* [[L_STRUCT_ADDR]] to %"class{{.*}}accessor" addrspace(4)*
+// CHECK: call spir_func void @{{.*}}MyUnion{{.*}}(%"class.{{.*}}.cl::sycl::accessor" addrspace(4)* [[ACC_CAST1]])
+//
CHECK: [[Z0:%[a-zA-Z0-9_]*]] = getelementptr inbounds %"class.{{.*}}.anon", %"class.{{.*}}.anon"* [[LOCAL_OBJECT]], i32 0, i32 1 +// CHECK: [[MEMCPY_DST:%[0-9a-zA-Z_]+]] = bitcast %union.{{.*}}MyUnion* [[Z0]] to i8* +// CHECK: [[MEMCPY_SRC:%[0-9a-zA-Z_]+]] = bitcast %union.{{.*}}MyUnion* [[MEM_ARG4]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[MEMCPY_DST]], i8* align 4 [[MEMCPY_SRC]], i64 12, i1 false) +// CHECK: [[Z1:%[a-zA-Z0-9_]*]] = getelementptr inbounds %"class.{{.*}}.anon", %"class.{{.*}}.anon"* [[LOCAL_OBJECT]], i32 0, i32 0 + +// Check load from kernel pointer argument alloca +// CHECK: [[MEM_LOAD1:%[a-zA-Z0-9_]+]] = load %union._ZTS7MyUnion.MyUnion addrspace(1)*, %union._ZTS7MyUnion.MyUnion addrspace(1)** [[MEM_ARG1]].addr{{[0-9]*}}, align 8 +// CHECK: [[MEMCPY_DST1:%[0-9a-zA-Z_]+]] = bitcast %"struct.{{.*}}.cl::sycl::range"* [[ACC_RANGE1]] to i8* +// CHECK: [[MEMCPY_SRC1:%[0-9a-zA-Z_]+]] = bitcast %"struct.{{.*}}.cl::sycl::range"* [[MEM_ARG2]] to i8* +// call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[MEMCPY_DST1]], i8* align 4 [[MEMCPY_SRC1]], i64 4, i1 false), !tbaa.struct [[ACC_CAST2:%[0-9]+]] +// CHECK: [[MEMCPY_DST2:%[0-9a-zA-Z_]+]] = bitcast %"struct.{{.*}}.cl::sycl::range"* [[ACC_RANGE2]] to i8* +// CHECK: [[MEMCPY_SRC2:%[0-9a-zA-Z_]+]] = bitcast %"struct.{{.*}}.cl::sycl::range"* [[MEM_ARG3]] to i8* +// call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[MEMCPY_DST2]], i8* align 4 [[MEMCPY_SRC2]], i64 4, i1 false), !tbaa.struct [[ACC_CAST2:%[0-9]+]] + +// Check __init method call +// CHECK: [[ACC_CAST1:%[0-9]+]] = addrspacecast %"class{{.*}}accessor"* [[Z1]] to %"class{{.*}}accessor" addrspace(4)* +// CHECK: call spir_func void @{{.*}}__init{{.*}}(%"class.{{.*}}.cl::sycl::accessor" addrspace(4)* [[ACC_CAST1]], %union._ZTS7MyUnion.MyUnion addrspace(1)* [[MEM_LOAD1]], %"struct.{{.*}}.cl::sycl::range"* byval({{.*}}) align 4 [[ACC_RANGE1]], %"struct.{{.*}}.cl::sycl::range"* byval({{.*}}) align 4 [[ACC_RANGE2]], %"struct.{{.*}}.cl::sycl::id"* byval({{.*}}) align 4 [[OFFSET2]]) +// CHECK: [[ACC_CAST2:%[0-9]+]] = addrspacecast %"class{{.*}}.anon"* [[LOCAL_OBJECT]] to %"class{{.*}}.anon" addrspace(4)* +// CHECK: call spir_func void @{{.*}}(%"class.{{.*}}.anon" addrspace(4)* [[ACC_CAST2]]) diff --git a/clang/test/CoverageMapping/break.c b/clang/test/CoverageMapping/break.c index 08461d7ed2de2..191f34765a65b 100644 --- a/clang/test/CoverageMapping/break.c +++ b/clang/test/CoverageMapping/break.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %t.stripped.c | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 int cnt = 0; // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:18 = #0 diff --git a/clang/test/CoverageMapping/builtinmacro.c b/clang/test/CoverageMapping/builtinmacro.c index 63f5584d40c2c..0de0e8e6a1252 100644 --- a/clang/test/CoverageMapping/builtinmacro.c +++ b/clang/test/CoverageMapping/builtinmacro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name 
builtinmacro.c %t.stripped.c | FileCheck %s // Test the coverage mapping generation for built-in macroes. diff --git a/clang/test/CoverageMapping/classtemplate.cpp b/clang/test/CoverageMapping/classtemplate.cpp index 0dbb0c0ede841..9250069e04f65 100644 --- a/clang/test/CoverageMapping/classtemplate.cpp +++ b/clang/test/CoverageMapping/classtemplate.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %t.stripped.cpp > %tmapping // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-CONSTRUCTOR // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-GETTER // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-SETTER diff --git a/clang/test/CoverageMapping/comment-in-macro.c b/clang/test/CoverageMapping/comment-in-macro.c index 06e8adbc41ee2..86b554f8bd50c 100644 --- a/clang/test/CoverageMapping/comment-in-macro.c +++ b/clang/test/CoverageMapping/comment-in-macro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %t.stripped.c | FileCheck %s #define x1 "" // ... #define x2 return 0 @@ -7,5 +8,5 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+3]]:2 = #0 x1; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:5 = #0 x2; // CHECK-NEXT: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:5 = #0 } -// CHECK-NEXT: File 1, 3:12 -> 3:14 = #0 -// CHECK-NEXT: File 2, 4:12 -> 4:20 = #0 +// CHECK-NEXT: File 1, 4:12 -> 4:14 = #0 +// CHECK-NEXT: File 2, 5:12 -> 5:20 = #0 diff --git a/clang/test/CoverageMapping/continue.c b/clang/test/CoverageMapping/continue.c index 9864c912f239c..774fe2ee6fffd 100644 --- a/clang/test/CoverageMapping/continue.c +++ b/clang/test/CoverageMapping/continue.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %t.stripped.c | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+21]]:2 = #0 int j = 0; // CHECK-NEXT: File 0, [[@LINE+2]]:18 -> [[@LINE+2]]:24 = (#0 + #1) diff --git a/clang/test/CoverageMapping/coroutine.cpp b/clang/test/CoverageMapping/coroutine.cpp index dc9473348fc90..a614e6e8ea402 100644 --- a/clang/test/CoverageMapping/coroutine.cpp +++ b/clang/test/CoverageMapping/coroutine.cpp @@ -1,6 +1,7 @@ // fixme: the following line is added to cleanup bots, will be removed in weeks. 
// RUN: rm -f %S/coroutine.ll -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping %s -o - | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping %t.stripped.cpp -o - | FileCheck %s namespace std::experimental { template diff --git a/clang/test/CoverageMapping/deferred-region.cpp b/clang/test/CoverageMapping/deferred-region.cpp index 45113e46f02b2..8db6119472dbc 100644 --- a/clang/test/CoverageMapping/deferred-region.cpp +++ b/clang/test/CoverageMapping/deferred-region.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -fexceptions -fcxx-exceptions -emit-llvm-only -triple %itanium_abi_triple -main-file-name deferred-region.cpp -I %S/Inputs %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -fexceptions -fcxx-exceptions -emit-llvm-only -triple %itanium_abi_triple -main-file-name deferred-region.cpp -I %S/Inputs %t.stripped.cpp | FileCheck %s #define IF if #define STMT(S) S diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index e3d6f4e25e573..8ffc09d29a3c7 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name if.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name if.cpp %t.stripped.cpp | FileCheck %s int nop() { return 0; } diff --git a/clang/test/CoverageMapping/includehell.cpp b/clang/test/CoverageMapping/includehell.cpp index fd08d6af7f3da..c92f12e5e80d0 100644 --- a/clang/test/CoverageMapping/includehell.cpp +++ b/clang/test/CoverageMapping/includehell.cpp @@ -51,6 +51,7 @@ int main() { // CHECK-START: File [[START3]], 4:29 -> 5:1 = #9 // CHECK-CODE: File [[CODE1:[0-9]]], 1:1 -> 14:1 = #1 +// CHECK-CODE: Skipped,File [[CODE1]], 1:1 -> 1:41 = 0 // CHECK-CODE-NEXT: File [[CODE1]], 4:5 -> 4:11 = #1 // CHECK-CODE: File [[CODE1]], 4:13 -> 6:2 = #2 // CHECK-CODE: File [[CODE1]], 6:8 -> 8:2 = (#1 - #2) @@ -58,6 +59,7 @@ int main() { // CHECK-CODE: File [[CODE1]], 9:11 -> 11:2 = #3 // CHECK-CODE: File [[CODE1]], 11:8 -> 13:2 = (#1 - #3) // CHECK-CODE: File [[CODE2:[0-9]]], 1:1 -> 14:1 = #5 +// CHECK-CODE: Skipped,File [[CODE2]], 1:1 -> 1:41 = 0 // CHECK-CODE-NEXT: File [[CODE2]], 4:5 -> 4:11 = #5 // CHECK-CODE: File [[CODE2]], 4:13 -> 6:2 = #6 // CHECK-CODE: File [[CODE2]], 6:8 -> 8:2 = (#5 - #6) diff --git a/clang/test/CoverageMapping/label.cpp b/clang/test/CoverageMapping/label.cpp index 995835ad6b2ff..e77372d325e5a 100644 --- a/clang/test/CoverageMapping/label.cpp +++ b/clang/test/CoverageMapping/label.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp 
%t.stripped.cpp | FileCheck %s - // CHECK: func +// CHECK: func void func() { // CHECK-NEXT: File 0, [[@LINE]]:13 -> {{[0-9]+}}:2 = #0 int i = 0; // CHECK-NEXT: File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:20 = (#0 + #3) // CHECK-NEXT: File 0, [[@LINE+1]]:22 -> [[@LINE+1]]:25 = #3 diff --git a/clang/test/CoverageMapping/logical.cpp b/clang/test/CoverageMapping/logical.cpp index bc7c785b7b550..cca03377c98a4 100644 --- a/clang/test/CoverageMapping/logical.cpp +++ b/clang/test/CoverageMapping/logical.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %t.stripped.cpp | FileCheck %s int main() { // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+15]]:2 = #0 bool bt = true; diff --git a/clang/test/CoverageMapping/loops.cpp b/clang/test/CoverageMapping/loops.cpp index ff7aafd66d944..498d214e69d9c 100644 --- a/clang/test/CoverageMapping/loops.cpp +++ b/clang/test/CoverageMapping/loops.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %t.stripped.cpp | FileCheck %s - // CHECK: rangedFor +// CHECK: rangedFor void rangedFor() { // CHECK-NEXT: File 0, [[@LINE]]:18 -> {{[0-9]+}}:2 = #0 int arr[] = { 1, 2, 3, 4, 5 }; int sum = 0; // CHECK: Gap,File 0, [[@LINE+1]]:20 -> [[@LINE+1]]:21 = #1 diff --git a/clang/test/CoverageMapping/macro-expressions.cpp b/clang/test/CoverageMapping/macro-expressions.cpp index 26d70c67fca08..60afc5238b9eb 100644 --- a/clang/test/CoverageMapping/macro-expressions.cpp +++ b/clang/test/CoverageMapping/macro-expressions.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp -w %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp -w %t.stripped.cpp | FileCheck %s #define EXPR(x) (x) #define NEXPR(x) (!x) #define DECL(T, x) T x diff --git a/clang/test/CoverageMapping/macroparams2.c b/clang/test/CoverageMapping/macroparams2.c index 4e04581b725e8..30ce25d9accda 100644 --- a/clang/test/CoverageMapping/macroparams2.c +++ b/clang/test/CoverageMapping/macroparams2.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %t.stripped.c | FileCheck %s #define MACRO(REFS, CALLS) (4 * (CALLS) < (REFS)) struct S { diff --git a/clang/test/CoverageMapping/macros.c b/clang/test/CoverageMapping/macros.c index 39cd190b2a882..83e2029be5612 100644 --- a/clang/test/CoverageMapping/macros.c +++ b/clang/test/CoverageMapping/macros.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only 
-main-file-name macros.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %t.stripped.c | FileCheck %s #define MACRO return; bar() #define MACRO_2 bar() #define MACRO_1 return; MACRO_2 diff --git a/clang/test/CoverageMapping/macroscopes.cpp b/clang/test/CoverageMapping/macroscopes.cpp index 3f5f65e5ad7b5..62f5dbe77981f 100644 --- a/clang/test/CoverageMapping/macroscopes.cpp +++ b/clang/test/CoverageMapping/macroscopes.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %t.stripped.cpp | FileCheck %s #define starts_a_scope for (int i = 0; i < 2; ++i) { #define ends_a_scope \ diff --git a/clang/test/CoverageMapping/moremacros.c b/clang/test/CoverageMapping/moremacros.c index 5e0d4a6abf257..ed89dcafd6723 100644 --- a/clang/test/CoverageMapping/moremacros.c +++ b/clang/test/CoverageMapping/moremacros.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %t.stripped.c | FileCheck %s #define LBRAC { #define RBRAC } diff --git a/clang/test/CoverageMapping/objc.m b/clang/test/CoverageMapping/objc.m index 4e4c184f0a887..008d291aee31a 100644 --- a/clang/test/CoverageMapping/objc.m +++ b/clang/test/CoverageMapping/objc.m @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 -w %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.m +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 -w %t.stripped.m | FileCheck %s @interface A - (void)bork:(int)msg; diff --git a/clang/test/CoverageMapping/pr32679.cpp b/clang/test/CoverageMapping/pr32679.cpp index eac3afb6efb6c..c93133f8c2f2d 100644 --- a/clang/test/CoverageMapping/pr32679.cpp +++ b/clang/test/CoverageMapping/pr32679.cpp @@ -1,5 +1,6 @@ -// RUN: %clang_cc1 -cc1 -triple i686-pc-windows-msvc19.0.0 -emit-obj -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2 -// RUN: %clang_cc1 -cc1 -triple %itanium_abi_triple -emit-obj -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %s | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2 +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -cc1 -triple i686-pc-windows-msvc19.0.0 -emit-obj -fprofile-instrument=clang -std=c++14 -fdelayed-template-parsing -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %t.stripped.cpp | FileCheck %s -check-prefix=MSABI -implicit-check-not=f2 +// RUN: %clang_cc1 
-cc1 -triple %itanium_abi_triple -emit-obj -fprofile-instrument=clang -std=c++14 -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name pr32679.cpp -o - %t.stripped.cpp | FileCheck %s -check-prefix=ITANIUM -implicit-check-not=f2 template struct CreateSpecialization; diff --git a/clang/test/CoverageMapping/preprocessor.c b/clang/test/CoverageMapping/preprocessor.c index 9225c9f162a20..b68aa2e0e93c6 100644 --- a/clang/test/CoverageMapping/preprocessor.c +++ b/clang/test/CoverageMapping/preprocessor.c @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %t.stripped.c | FileCheck %s - // CHECK: func +// CHECK: func void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0 int i = 0; #ifdef MACRO // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+2]]:7 = 0 @@ -11,7 +12,7 @@ void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0 // CHECK: main int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 int i = 0; -# if 0 // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+4]]:29 = 0 +#if 0 // CHECK-NEXT: Skipped,File 0, [[@LINE]]:1 -> [[@LINE+4]]:9 = 0 if(i == 0) { i = 1; } @@ -29,7 +30,7 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 } #endif - // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+4]]:24 + // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+4]]:8 #\ if 0 #\ @@ -59,7 +60,7 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0 #\ endif - // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+6]]:26 + // CHECK-NEXT: Skipped,File 0, [[@LINE+1]]:1 -> [[@LINE+6]]:10 #\ ifdef NOT_DEFINED #\ diff --git a/clang/test/CoverageMapping/return.c b/clang/test/CoverageMapping/return.c index 440acb569b8fa..fae2e9f761b7a 100644 --- a/clang/test/CoverageMapping/return.c +++ b/clang/test/CoverageMapping/return.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %t.stripped.c | FileCheck %s // CHECK: func void func() { // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+3]]:2 = #0 diff --git a/clang/test/CoverageMapping/switch.cpp b/clang/test/CoverageMapping/switch.cpp index 25ea4053f4e2c..1a8db09b3ff14 100644 --- a/clang/test/CoverageMapping/switch.cpp +++ b/clang/test/CoverageMapping/switch.cpp @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++1z -triple %itanium_abi_triple -main-file-name switch.cpp %t.stripped.cpp | FileCheck %s - // CHECK: foo +// CHECK: foo void foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+8]]:2 = #0 switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+4]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:11 = #2 diff --git a/clang/test/CoverageMapping/switchmacro.c 
b/clang/test/CoverageMapping/switchmacro.c index fc0392fb91e53..5c6a37e77f944 100644 --- a/clang/test/CoverageMapping/switchmacro.c +++ b/clang/test/CoverageMapping/switchmacro.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %t.stripped.c | FileCheck %s #define FOO(x) (void)x diff --git a/clang/test/CoverageMapping/test.c b/clang/test/CoverageMapping/test.c index ae73fcb3bbab2..559036a3be9be 100644 --- a/clang/test/CoverageMapping/test.c +++ b/clang/test/CoverageMapping/test.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %t.stripped.c | FileCheck %s void bar(); static void static_func(); diff --git a/clang/test/CoverageMapping/trycatch.cpp b/clang/test/CoverageMapping/trycatch.cpp index 5d284daaca017..ba1b26b7acee7 100644 --- a/clang/test/CoverageMapping/trycatch.cpp +++ b/clang/test/CoverageMapping/trycatch.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.cpp +// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %t.stripped.cpp | FileCheck %s class Error { }; diff --git a/clang/test/CoverageMapping/unreachable-macro.c b/clang/test/CoverageMapping/unreachable-macro.c index b9d4f3616ffa5..b84acca33f938 100644 --- a/clang/test/CoverageMapping/unreachable-macro.c +++ b/clang/test/CoverageMapping/unreachable-macro.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s - +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %t.stripped.c | FileCheck %s #define WHILE while (0) {} // CHECK: counters_in_macro_following_unreachable diff --git a/clang/test/CoverageMapping/while.c b/clang/test/CoverageMapping/while.c index 616ecf69020d7..e0c3ca35537f8 100644 --- a/clang/test/CoverageMapping/while.c +++ b/clang/test/CoverageMapping/while.c @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s +// RUN: %strip_comments > %t.stripped.c +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %t.stripped.c | FileCheck %s - // CHECK: main +// CHECK: main int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+8]]:2 = #0 int j = 0; // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:14 = (#0 + #1) while(j < 5) ++j; // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE]]:16 = #1 diff --git a/clang/test/Driver/Inputs/resource_dir/lib/darwin/libclang_rt.iossim.a 
b/clang/test/Driver/Inputs/resource_dir/lib/darwin/libclang_rt.iossim.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Xlinker-args.c b/clang/test/Driver/Xlinker-args.c index 72c2fd2698e88..a44957cd8aef1 100644 --- a/clang/test/Driver/Xlinker-args.c +++ b/clang/test/Driver/Xlinker-args.c @@ -8,13 +8,13 @@ /// -T is reordered to the last to make sure -L takes precedence. // RUN: %clang -target x86_64-pc-linux-gnu -### \ -// RUN: -T a.lds -Xlinker one -Xlinker --no-demangle \ +// RUN: -e _start -T a.lds -Xlinker one -Xlinker --no-demangle \ // RUN: -Wl,two,--no-demangle,three -Xlinker four -z five -r %s 2> %t // RUN: FileCheck -check-prefix=LINUX < %t %s // // DARWIN-NOT: --no-demangle // DARWIN: "one" "two" "three" "four" "-z" "five" "-r" -// LINUX: "--no-demangle" "one" "two" "three" "four" "-z" "five" "-r" {{.*}} "-T" "a.lds" +// LINUX: "--no-demangle" "-e" "_start" "one" "two" "three" "four" "-z" "five" "-r" {{.*}} "-T" "a.lds" // Check that we forward '-Xlinker' and '-Wl,' on Windows. // RUN: %clang -target i686-pc-win32 -### \ diff --git a/clang/test/Driver/aix-as.c b/clang/test/Driver/aix-as.c index cb3053f5acd31..aa8c610359037 100644 --- a/clang/test/Driver/aix-as.c +++ b/clang/test/Driver/aix-as.c @@ -9,7 +9,6 @@ // CHECK-AS32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" // CHECK-AS32: "{{.*}}as{{(.exe)?}}" // CHECK-AS32: "-a32" -// CHECK-AS32: "-u" // CHECK-AS32: "-many" // Check powerpc64-ibm-aix7.1.0.0, 64-bit. @@ -20,7 +19,6 @@ // CHECK-AS64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" // CHECK-AS64: "{{.*}}as{{(.exe)?}}" // CHECK-AS64: "-a64" -// CHECK-AS64: "-u" // CHECK-AS64: "-many" // Check powerpc-ibm-aix7.1.0.0, 32-bit. -Xassembler option. @@ -32,7 +30,6 @@ // CHECK-AS32-Xassembler: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" // CHECK-AS32-Xassembler: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-Xassembler: "-a32" -// CHECK-AS32-Xassembler: "-u" // CHECK-AS32-Xassembler: "-many" // CHECK-AS32-Xassembler: "-w" @@ -45,7 +42,6 @@ // CHECK-AS64-Wa: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" // CHECK-AS64-Wa: "{{.*}}as{{(.exe)?}}" // CHECK-AS64-Wa: "-a64" -// CHECK-AS64-Wa: "-u" // CHECK-AS64-Wa: "-many" // CHECK-AS64-Wa: "-v" // CHECK-AS64-Wa: "-w" @@ -60,13 +56,10 @@ // CHECK-AS32-MultiInput-NOT: warning: // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" // CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" // CHECK-AS32-MultiInput: "-a32" -// CHECK-AS32-MultiInput: "-u" // CHECK-AS32-MultiInput: "-many" diff --git a/clang/test/Driver/arm-nofp-disabled-features.c b/clang/test/Driver/arm-nofp-disabled-features.c new file mode 100644 index 0000000000000..432e4a98cffc6 --- /dev/null +++ b/clang/test/Driver/arm-nofp-disabled-features.c @@ -0,0 +1,18 @@ +// RUN: %clang -target arm-arm-none-eabi -mfloat-abi=soft %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MFLOAT-ABI-SOFT +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-dotprod" +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-fp16fml" +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-bf16" +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-mve" +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-mve.fp" +// CHECK-MFLOAT-ABI-SOFT: "-target-feature" "-fpregs" + +// RUN: %clang -target 
arm-arm-none-eabi -mfpu=none %s -### 2>&1 | FileCheck %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8-a+nofp %s -### 2>&1 | FileCheck %s +// RUN: %clang -target arm-arm-none-eabi -mcpu=cortex-a35+nofp %s -### 2>&1 | FileCheck %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8-a+nofp+nomve %s -### 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMVE +// RUN: %clang -target arm-arm-none-eabi -mcpu=cortex-a35+nofp+nomve %s -### 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMVE +// CHECK: "-target-feature" "-dotprod" +// CHECK: "-target-feature" "-fp16fml" +// CHECK: "-target-feature" "-bf16" +// CHECK: "-target-feature" "-mve.fp" +// CHECK-NOMVE: "-target-feature" "-fpregs" diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 0dcaf61088069..d0c48ae41d9a2 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -524,11 +524,11 @@ // RUN: %clang_cl /Zi /c -### -- %s 2>&1 | FileCheck -check-prefix=Zi %s // Zi: "-gcodeview" -// Zi: "-debug-info-kind=constructor" +// Zi: "-debug-info-kind=limited" // RUN: %clang_cl /Z7 /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7 %s // Z7: "-gcodeview" -// Z7: "-debug-info-kind=constructor" +// Z7: "-debug-info-kind=limited" // RUN: %clang_cl /Zd /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7GMLT %s // Z7GMLT: "-gcodeview" @@ -557,7 +557,7 @@ // which made it "win". This test could not detect that bug. // RUN: %clang_cl /Z7 -gdwarf /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7_gdwarf %s // Z7_gdwarf: "-gcodeview" -// Z7_gdwarf: "-debug-info-kind=constructor" +// Z7_gdwarf: "-debug-info-kind=limited" // Z7_gdwarf: "-dwarf-version=4" // RUN: %clang_cl -fmsc-version=1800 -TP -### -- %s 2>&1 | FileCheck -check-prefix=CXX11 %s diff --git a/clang/test/Driver/clang-g-opts.c b/clang/test/Driver/clang-g-opts.c index 60c97790b7dae..bc714b6c93791 100644 --- a/clang/test/Driver/clang-g-opts.c +++ b/clang/test/Driver/clang-g-opts.c @@ -31,7 +31,7 @@ // RUN: | FileCheck --check-prefix=CHECK-WITH-G-DWARF2 %s // CHECK-WITHOUT-G-NOT: -debug-info-kind -// CHECK-WITH-G: "-debug-info-kind=constructor" +// CHECK-WITH-G: "-debug-info-kind=limited" // CHECK-WITH-G: "-dwarf-version=4" // CHECK-WITH-G-DWARF2: "-dwarf-version=2" diff --git a/clang/test/Driver/clang-translation.c b/clang/test/Driver/clang-translation.c index 2f02970a2a8ee..d1daeb80004b7 100644 --- a/clang/test/Driver/clang-translation.c +++ b/clang/test/Driver/clang-translation.c @@ -167,12 +167,6 @@ // PPCPWR8: "-cc1" // PPCPWR8: "-target-cpu" "pwr8" -// RUN: %clang -target powerpc64-unknown-linux-gnu \ -// RUN: -### -S %s -mcpu=a2q 2>&1 | FileCheck -check-prefix=PPCA2Q %s -// PPCA2Q: clang -// PPCA2Q: "-cc1" -// PPCA2Q: "-target-cpu" "a2q" - // RUN: %clang -target powerpc64-unknown-linux-gnu \ // RUN: -### -S %s -mcpu=630 2>&1 | FileCheck -check-prefix=PPC630 %s // PPC630: clang diff --git a/clang/test/Driver/cuda-dwarf-2.cu b/clang/test/Driver/cuda-dwarf-2.cu index 92b8919729fc4..bcfb2444bc516 100644 --- a/clang/test/Driver/cuda-dwarf-2.cu +++ b/clang/test/Driver/cuda-dwarf-2.cu @@ -49,7 +49,7 @@ // HAS_DEBUG-NOT: warning: debug // HAS_DEBUG: "-fcuda-is-device" -// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}" +// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}" // HAS_DEBUG-SAME: "-dwarf-version=2" // HAS_DEBUG: ptxas // HAS_DEBUG-SAME: "-g" diff --git a/clang/test/Driver/darwin-dsymutil.c b/clang/test/Driver/darwin-dsymutil.c index 09451a81b797d..8cdb2f3cbf644 100644 --- 
a/clang/test/Driver/darwin-dsymutil.c +++ b/clang/test/Driver/darwin-dsymutil.c @@ -26,10 +26,21 @@ // // RUN: %clang -target x86_64-apple-darwin10 -ccc-print-bindings \ // RUN: -o foo %s -g 2> %t -// RUN: FileCheck -check-prefix=CHECK-OUTPUT-NAME < %t %s +// RUN: FileCheck -Doutfile=foo -Ddsymfile=foo.dSYM \ +// RUN: -check-prefix=CHECK-OUTPUT-NAME < %t %s // -// CHECK-OUTPUT-NAME: "x86_64-apple-darwin10" - "darwin::Linker", inputs: [{{.*}}], output: "foo" -// CHECK-OUTPUT-NAME: "x86_64-apple-darwin10" - "darwin::Dsymutil", inputs: ["foo"], output: "foo.dSYM" +// RUN: %clang -target x86_64-apple-darwin10 -ccc-print-bindings \ +// RUN: -o bar/foo %s -g 2> %t +// RUN: FileCheck -Doutfile=bar/foo -Ddsymfile=bar/foo.dSYM \ +// RUN: -check-prefix=CHECK-OUTPUT-NAME < %t %s +// +// RUN: %clang -target x86_64-apple-darwin10 -ccc-print-bindings \ +// RUN: -o bar/foo -dsym-dir external %s -g 2> %t +// RUN: FileCheck -Doutfile=bar/foo -Ddsymfile=external/foo.dSYM \ +// RUN: -check-prefix=CHECK-OUTPUT-NAME < %t %s +// +// CHECK-OUTPUT-NAME: "x86_64-apple-darwin10" - "darwin::Linker", inputs: [{{.*}}], output: "[[outfile]]" +// CHECK-OUTPUT-NAME: "x86_64-apple-darwin10" - "darwin::Dsymutil", inputs: ["[[outfile]]"], output: "[[dsymfile]]" // Check that we only use dsymutil when needed. // @@ -38,12 +49,5 @@ // RUN: -o foo %t.o -g 2> %t // RUN: not grep "Dsymutil" %t -// Check that we put the .dSYM in the right place. -// RUN: %clang -target x86_64-apple-darwin10 -ccc-print-bindings \ -// RUN: -o bar/foo %s -g 2> %t -// RUN: FileCheck -check-prefix=CHECK-LOCATION < %t %s - -// CHECK-LOCATION: "x86_64-apple-darwin10" - "darwin::Dsymutil", inputs: ["bar/foo"], output: "bar/foo.dSYM" - // Check that we don't crash when translating arguments for dsymutil. // RUN: %clang -m32 -arch x86_64 -g %s -### diff --git a/clang/test/Driver/darwin-ld.c b/clang/test/Driver/darwin-ld.c index ea71142e88c19..82cd36d59f80d 100644 --- a/clang/test/Driver/darwin-ld.c +++ b/clang/test/Driver/darwin-ld.c @@ -156,7 +156,7 @@ // RUN: FileCheck -check-prefix=LINK_IOSSIM_PROFILE %s < %t.log // LINK_IOSSIM_PROFILE: {{ld(.exe)?"}} // LINK_IOSSIM_PROFILE: libclang_rt.profile_iossim.a -// LINK_IOSSIM_PROFILE: libclang_rt.ios.a +// LINK_IOSSIM_PROFILE: libclang_rt.iossim.a // RUN: %clang -target arm64-apple-tvos8.3 -mlinker-version=400 -mtvos-version-min=8.3 -resource-dir=%S/Inputs/resource_dir -### %t.o 2> %t.log // RUN: FileCheck -check-prefix=LINK_TVOS_ARM64 %s < %t.log diff --git a/clang/test/Driver/debug-options-as.c b/clang/test/Driver/debug-options-as.c index 4808219702e76..51475680e9b18 100644 --- a/clang/test/Driver/debug-options-as.c +++ b/clang/test/Driver/debug-options-as.c @@ -23,7 +23,7 @@ // RUN: | FileCheck %s // // CHECK: "-cc1as" -// CHECK: "-debug-info-kind=constructor" +// CHECK: "-debug-info-kind=limited" // Check to make sure clang with -g on a .s file gets passed -dwarf-debug-producer. // rdar://12955296 diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c index 2d1a0b2d5cd8f..189c1f9addeb9 100644 --- a/clang/test/Driver/debug-options.c +++ b/clang/test/Driver/debug-options.c @@ -274,18 +274,18 @@ // GLIO_ONLY_DWARF2: "-dwarf-version=2" // // G_ONLY: "-cc1" -// G_ONLY: "-debug-info-kind=constructor" +// G_ONLY: "-debug-info-kind=limited" // // These tests assert that "-gline-tables-only" "-g" uses the latter, // but otherwise not caring about the DebugInfoKind. 
// G_ONLY_DWARF2: "-cc1" -// G_ONLY_DWARF2: "-debug-info-kind={{standalone|constructor}}" +// G_ONLY_DWARF2: "-debug-info-kind={{standalone|limited}}" // G_ONLY_DWARF2: "-dwarf-version=2" // // G_STANDALONE: "-cc1" // G_STANDALONE: "-debug-info-kind=standalone" // G_LIMITED: "-cc1" -// G_LIMITED: "-debug-info-kind=constructor" +// G_LIMITED: "-debug-info-kind=limited" // G_DWARF2: "-dwarf-version=2" // G_DWARF4: "-dwarf-version=4" // @@ -339,7 +339,7 @@ // NOCI: "-gno-column-info" // // GEXTREFS: "-dwarf-ext-refs" "-fmodule-format=obj" -// GEXTREFS: "-debug-info-kind={{standalone|constructor}}" +// GEXTREFS: "-debug-info-kind={{standalone|limited}}" // RUN: not %clang -cc1 -debug-info-kind=watkind 2>&1 | FileCheck -check-prefix=BADSTRING1 %s // BADSTRING1: error: invalid value 'watkind' in '-debug-info-kind=watkind' diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 7340bfb35e40c..cfefd3fb632cb 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -458,6 +458,10 @@ // RUN: %clang -target x86_64-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-DARWIN // CHECK-TSAN-X86-64-DARWIN-NOT: unsupported option +// RUN: %clang -target x86_64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-MACOS +// CHECK-TSAN-X86-64-MACOS-NOT: unsupported option +// RUN: %clang -target arm64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-ARM64-MACOS +// CHECK-TSAN-ARM64-MACOS-NOT: unsupported option // RUN: %clang -target x86_64-apple-iossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-IOSSIMULATOR // CHECK-TSAN-X86-64-IOSSIMULATOR-NOT: unsupported option diff --git a/clang/test/Driver/gcc_forward.c b/clang/test/Driver/gcc_forward.c index f75b1c738b05c..a99944f8f5336 100644 --- a/clang/test/Driver/gcc_forward.c +++ b/clang/test/Driver/gcc_forward.c @@ -1,3 +1,8 @@ +// RUN: %clang -### %s -target aarch64-none-elf \ +// RUN: --coverage -e _start -fuse-ld=lld --ld-path=ld -nostdlib -r -rdynamic -static -static-pie \ +// RUN: 2>&1 | FileCheck --check-prefix=FORWARD %s +// FORWARD: gcc{{[^"]*}}" "--coverage" "-fuse-ld=lld" "--ld-path=ld" "-nostdlib" "-rdynamic" "-static" "-static-pie" "-o" "a.out" "{{.*}}.o" "-e" "_start" "-r" + // Check that we don't try to forward -Xclang or -mlinker-version to GCC. // PR12920 -- Check also we may not forward W_Group options to GCC. // @@ -5,7 +10,7 @@ // RUN: %s \ // RUN: -Wall -Wdocumentation \ // RUN: -Xclang foo-bar \ -// RUN: -march=x86-64 \ +// RUN: -pie -march=x86-64 \ // RUN: -mlinker-version=10 -### 2> %t // RUN: FileCheck < %t %s // @@ -15,13 +20,13 @@ // CHECK: "-o" "{{[^"]+}}.o" // // gcc as ld. -// CHECK: gcc{{[^"]*}}" +// CHECK: gcc{{[^"]*}}" "-pie" // CHECK-NOT: "-mlinker-version=10" // CHECK-NOT: "-Xclang" // CHECK-NOT: "foo-bar" // CHECK-NOT: "-Wall" // CHECK-NOT: "-Wdocumentation" -// CHECK: -march +// CHECK-NOT: -march // CHECK-NOT: "-mlinker-version=10" // CHECK-NOT: "-Xclang" // CHECK-NOT: "foo-bar" @@ -34,9 +39,3 @@ // RUN: | FileCheck --check-prefix=CHECK-ASM %s // CHECK-ASM: as // CHECK-ASM-NOT: "-g" - -// Check that we're not forwarding -mno-unaligned-access. 
-// RUN: %clang -target aarch64-none-elf -mno-unaligned-access %s -### 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ARM %s -// CHECK-ARM: gcc{{[^"]*}}" -// CHECK-ARM-NOT: -mno-unaligned-access diff --git a/clang/test/Driver/gfortran.f90 b/clang/test/Driver/gfortran.f90 index d2f90b47a3902..6f972cc333ae0 100644 --- a/clang/test/Driver/gfortran.f90 +++ b/clang/test/Driver/gfortran.f90 @@ -1,3 +1,4 @@ +! XFAIL: * ! Test that Clang can forward all of the flags which are documented as ! being supported by gfortran to GCC when falling back to GCC for ! a fortran input file. diff --git a/clang/test/Driver/integrated-as.s b/clang/test/Driver/integrated-as.s index 05999cfe002b5..0194a3d5a4382 100644 --- a/clang/test/Driver/integrated-as.s +++ b/clang/test/Driver/integrated-as.s @@ -27,19 +27,19 @@ // XA_INCLUDE2: "-Ifoo_dir" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2 %s -// DWARF2: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-3 2>&1 | FileCheck --check-prefix=DWARF3 %s -// DWARF3: "-debug-info-kind=constructor" "-dwarf-version=3" +// DWARF3: "-debug-info-kind=limited" "-dwarf-version=3" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 2>&1 | FileCheck --check-prefix=DWARF4 %s -// DWARF4: "-debug-info-kind=constructor" "-dwarf-version=4" +// DWARF4: "-debug-info-kind=limited" "-dwarf-version=4" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Xassembler -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2XASSEMBLER %s -// DWARF2XASSEMBLER: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2XASSEMBLER: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2WA %s -// DWARF2WA: "-debug-info-kind=constructor" "-dwarf-version=2" +// DWARF2WA: "-debug-info-kind=limited" "-dwarf-version=2" // A dwarf version number that driver can't parse is just stuffed in. 
// RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-huh 2>&1 | FileCheck --check-prefix=BOGODWARF %s diff --git a/clang/test/Driver/myriad-toolchain.c b/clang/test/Driver/myriad-toolchain.c index a4bd260a14986..215a02fd0dec1 100644 --- a/clang/test/Driver/myriad-toolchain.c +++ b/clang/test/Driver/myriad-toolchain.c @@ -83,7 +83,7 @@ // NOSTDLIB-NOT: "-lc" // RUN: %clang -### -c -g %s -target sparc-myriad 2>&1 | FileCheck -check-prefix=G_SPARC %s -// G_SPARC: "-debug-info-kind=constructor" "-dwarf-version=2" +// G_SPARC: "-debug-info-kind=limited" "-dwarf-version=2" // RUN: %clang -### -c %s -target sparc-myriad-rtems -fuse-init-array 2>&1 \ // RUN: | FileCheck -check-prefix=USE-INIT-ARRAY %s diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index 3ddd6446d1176..6415f1d61b720 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -241,7 +241,7 @@ // HAS_DEBUG-NOT: warning: debug // HAS_DEBUG: "-triple" "nvptx64-nvidia-cuda" -// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}" +// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}" // HAS_DEBUG-SAME: "-dwarf-version=2" // HAS_DEBUG-SAME: "-fopenmp-is-device" // HAS_DEBUG: ptxas diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index acc4981a2eee6..2b5cc463e7c3d 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -5,14 +5,6 @@ // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1 %s -// RUN: %clang -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-BE %s // RUN: %clang -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -34,8 +26,6 @@ // CHECK-ELFv1: "-target-abi" "elfv1" // CHECK-ELFv1-LE: "-mrelocation-model" "static" // CHECK-ELFv1-LE: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX: "-mrelocation-model" "static" -// CHECK-ELFv1-QPX: "-target-abi" "elfv1-qpx" // CHECK-ELFv2: "-mrelocation-model" "static" // CHECK-ELFv2: "-target-abi" "elfv2" // CHECK-ELFv2-BE: "-mrelocation-model" "static" @@ -48,14 +38,6 @@ // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv1 | FileCheck -check-prefix=CHECK-ELFv1-PIC %s // RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mabi=elfv1-qpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2 -mqpx | FileCheck -check-prefix=CHECK-ELFv1-QPX-PIC %s -// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ -// RUN: -mcpu=a2q -mno-qpx | FileCheck -check-prefix=CHECK-ELFv1-PIC %s 
-// RUN: %clang -fPIC -target powerpc64-unknown-linux-gnu %s -### -o %t.o 2>&1 \ // RUN: -mabi=elfv2 | FileCheck -check-prefix=CHECK-ELFv2-PIC %s // RUN: %clang -fPIC -target powerpc64le-unknown-linux-gnu %s -### -o %t.o 2>&1 \ @@ -69,8 +51,6 @@ // CHECK-ELFv1-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv1-PIC: "-target-abi" "elfv1" -// CHECK-ELFv1-QPX-PIC: "-mrelocation-model" "pic" "-pic-level" "2" -// CHECK-ELFv1-QPX-PIC: "-target-abi" "elfv1-qpx" // CHECK-ELFv2-PIC: "-mrelocation-model" "pic" "-pic-level" "2" // CHECK-ELFv2-PIC: "-target-abi" "elfv2" diff --git a/clang/test/Driver/ppc-dependent-options.cpp b/clang/test/Driver/ppc-dependent-options.cpp index 05e4b5b10fc49..1c1a0c38cdcb0 100644 --- a/clang/test/Driver/ppc-dependent-options.cpp +++ b/clang/test/Driver/ppc-dependent-options.cpp @@ -54,6 +54,10 @@ // RUN: -mcpu=power9 -std=c++11 -mno-vsx -mfloat128 %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-NVSX-FLT128 +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -mcpu=power10 -std=c++11 -mno-vsx -mpaired-vector-memops %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-NVSX-PAIRED-VEC-MEMOPS + // RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ // RUN: -mcpu=power9 -std=c++11 -mno-vsx -mfloat128 -mpower9-vector %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-NVSX-MULTI @@ -96,6 +100,7 @@ static_assert(false, "Neither enabled"); // CHECK-NVSX-P10V: error: option '-mpower10-vector' cannot be specified with '-mno-vsx' // CHECK-NVSX-FLT128: error: option '-mfloat128' cannot be specified with '-mno-vsx' // CHECK-NVSX-DMV: error: option '-mdirect-move' cannot be specified with '-mno-vsx' +// CHECK-NVSX-PAIRED-VEC-MEMOPS: error: option '-mpaired-vector-memops' cannot be specified with '-mno-vsx' // CHECK-NVSX-MULTI: error: option '-mfloat128' cannot be specified with '-mno-vsx' // CHECK-NVSX-MULTI: error: option '-mpower9-vector' cannot be specified with '-mno-vsx' // CHECK-NVSX: Neither enabled diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index 725201a77ba7c..8b630b1846c97 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -380,6 +380,6 @@ // RV32-EXPERIMENTAL-V-BADVERS: error: invalid arch name 'rv32iv0p1' // RV32-EXPERIMENTAL-V-BADVERS: unsupported version number 0.1 for experimental extension -// RUN: %clang -target riscv32-unknown-elf -march=rv32iv0p8 -menable-experimental-extensions -### %s -c 2>&1 | \ +// RUN: %clang -target riscv32-unknown-elf -march=rv32iv0p9 -menable-experimental-extensions -### %s -c 2>&1 | \ // RUN: FileCheck -check-prefix=RV32-EXPERIMENTAL-V-GOODVERS %s // RV32-EXPERIMENTAL-V-GOODVERS: "-target-feature" "+experimental-v" diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c index 70f8d91d48e01..d40207d5ae3b6 100644 --- a/clang/test/Driver/split-debug.c +++ b/clang/test/Driver/split-debug.c @@ -68,18 +68,18 @@ // RUN: FileCheck -check-prefix=CHECK-NOINLINE-WITHOUT-SPLIT < %t %s // // CHECK-NOINLINE-WITHOUT-SPLIT: "-fno-split-dwarf-inlining" -// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=constructor" +// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=limited" // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-GMLT < %t %s // -// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=constructor" +// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=limited" // CHECK-SPLIT-WITH-GMLT: 
"-split-dwarf-output" // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-NOINL < %t %s // -// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=constructor" +// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=limited" // CHECK-SPLIT-WITH-NOINL: "-split-dwarf-output" // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -gmlt -fsplit-dwarf-inlining -S -### %s 2> %t @@ -92,7 +92,7 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-GMLT < %t %s // -// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=constructor" +// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=limited" // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-file" // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-output" @@ -117,6 +117,6 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -g0 -gsplit-dwarf=split -S -### %s 2> %t // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-G0 < %t %s // -// CHECK-SPLIT-OVER-G0: "-debug-info-kind=constructor" +// CHECK-SPLIT-OVER-G0: "-debug-info-kind=limited" // CHECK-SPLIT-OVER-G0: "-split-dwarf-file" // CHECK-SPLIT-OVER-G0: "-split-dwarf-output" diff --git a/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp b/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp new file mode 100644 index 0000000000000..1980c04877c51 --- /dev/null +++ b/clang/test/Layout/aix-Wpacked-expecting-diagnostics.cpp @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +struct A { + double d; +}; + +struct B { + char x[8]; +}; + +struct [[gnu::packed]] C : B, A { // expected-warning{{packed attribute is unnecessary for 'C'}} + char x alignas(4)[8]; +}; + +int b = sizeof(C); + +// CHECK: 0 | struct C +// CHECK-NEXT: 0 | struct B (base) +// CHECK-NEXT: 0 | char [8] x +// CHECK-NEXT: 8 | struct A (base) +// CHECK-NEXT: 8 | double d +// CHECK-NEXT: 16 | char [8] x +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=4] diff --git a/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp b/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp new file mode 100644 index 0000000000000..ed5362b3e8cd9 --- /dev/null +++ b/clang/test/Layout/aix-Wpacked-no-diagnostics.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -Wpacked \ +// RUN: -fdump-record-layouts -fsyntax-only -verify -x c++ < %s | \ +// RUN: FileCheck %s + +// expected-no-diagnostics + +struct [[gnu::packed]] Q { + double x [[gnu::aligned(4)]]; +}; + +struct QQ : Q { char x; }; + +int a = sizeof(QQ); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct Q +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct QQ +// CHECK-NEXT: 0 | struct Q (base) +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: 8 | char x +// CHECK-NEXT: | [sizeof=12, dsize=9, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=4] diff 
--git a/clang/test/Layout/aix-double-struct-member.cpp b/clang/test/Layout/aix-double-struct-member.cpp new file mode 100644 index 0000000000000..b51d10467bceb --- /dev/null +++ b/clang/test/Layout/aix-double-struct-member.cpp @@ -0,0 +1,428 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +namespace test1 { +// Test the class layout when having a double which is/is not the first struct +// member. +struct D { + double d1; + int i1; +}; + +struct DoubleFirst { + struct D d2; + int i2; +}; + +struct IntFirst { + int i3; + struct D d3; +}; + +int a = sizeof(DoubleFirst); +int b = sizeof(IntFirst); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::D +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | int i1 +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::DoubleFirst +// CHECK-NEXT: 0 | struct test1::D d2 +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | int i1 +// CHECK-NEXT: 16 | int i2 +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::IntFirst +// CHECK-NEXT: 0 | int i3 +// CHECK-NEXT: 4 | struct test1::D d3 +// CHECK-NEXT: 4 | double d1 +// CHECK-NEXT: 12 | int i1 +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] +} // namespace test1 + +namespace test2 { +// Test the class layout when having a zero-sized bitfield followed by double. +struct Double { + int : 0; + double d; +}; + +int a = sizeof(Double); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test2::Double +// CHECK-NEXT: 0:- | int +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] +} // namespace test2 + +namespace test3 { +// Test the class layout when having a double member in union. +union A { + int b; + double d; +}; + +struct UnionStruct { + union A a; + int i; +}; + +int a = sizeof(UnionStruct); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | union test3::A +// CHECK-NEXT: 0 | int b +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test3::UnionStruct +// CHECK-NEXT: 0 | union test3::A a +// CHECK-NEXT: 0 | int b +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | int i +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +} // namespace test3 + +namespace test4 { +// Test the class layout when having multiple base classes. 
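+// (B on its own gets preferred alignment 8 from its leading double, but as a
+// non-leading base of S it lands at offset 4, so the bump does not apply to
+// S, as the dumps below show.)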
+struct A { + int a; +}; + +struct B { + double d; +}; + +class S : A, B { +}; + +int a = sizeof(S); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test4::A +// CHECK-NEXT: 0 | int a +// CHECK-NEXT: | [sizeof=4, dsize=4, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=4, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test4::B +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | class test4::S +// CHECK-NEXT: 0 | struct test4::A (base) +// CHECK-NEXT: 0 | int a +// CHECK-NEXT: 4 | struct test4::B (base) +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +} // namespace test4 + +namespace test5 { +struct Empty { +}; + +struct EmptyDer : Empty { + double d; +}; + +struct NonEmpty { + int i; +}; + +struct NonEmptyDer : NonEmpty { + double d; +}; + +int a = sizeof(EmptyDer); +int b = sizeof(NonEmptyDer); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::Empty (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::EmptyDer +// CHECK-NEXT: 0 | struct test5::Empty (base) (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::NonEmpty +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: | [sizeof=4, dsize=4, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=4, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test5::NonEmptyDer +// CHECK-NEXT: 0 | struct test5::NonEmpty (base) +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +} // namespace test5 + +namespace test6 { +struct A { + struct B { + double d[3]; + } b; +}; + +int a = sizeof(A); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test6::A::B +// CHECK-NEXT: 0 | double [3] d +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test6::A +// CHECK-NEXT: 0 | struct test6::A::B b +// CHECK-NEXT: 0 | double [3] d +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +} // namespace test6 + +namespace test7 { +struct A { + struct B { + long double _Complex d[3]; + } b; +}; + +int a = sizeof(A); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test7::A::B +// CHECK-NEXT: 0 | _Complex long double [3] d +// CHECK-NEXT: | [sizeof=48, dsize=48, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=48, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test7::A +// CHECK-NEXT: 0 | struct test7::A::B b +// CHECK-NEXT: 0 | _Complex long double [3] d +// CHECK-NEXT: | [sizeof=48, dsize=48, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=48, nvalign=4, preferrednvalign=8] + +} // 
namespace test7 + +namespace test8 { +struct Emp {}; + +struct Y : Emp { + double d; +}; + +struct Z : Emp { + Y y; +}; + +int a = sizeof(Z); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Emp (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Y +// CHECK-NEXT: 0 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test8::Z +// CHECK-NEXT: 0 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 8 | struct test8::Y y +// CHECK-NEXT: 8 | struct test8::Emp (base) (empty) +// CHECK-NEXT: 8 | double d +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +} // namespace test8 + +namespace test9 { +// Test the class layout when having a zero-extent array in a base class, which +// renders the base class not empty. +struct A { char zea[0]; }; + +struct B : A { double d; }; + +struct C { double d; }; +struct D : A, C { char x; }; + +int a = sizeof(B); +int b = sizeof(D); + +// CHECK: 0 | struct test9::B +// CHECK-NEXT: 0 | struct test9::A (base) +// CHECK-NEXT: 0 | char [0] zea +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +// CHECK: 0 | struct test9::D +// CHECK-NEXT: 0 | struct test9::A (base) +// CHECK-NEXT: 0 | char [0] zea +// CHECK-NEXT: 0 | struct test9::C (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char x +// CHECK-NEXT: | [sizeof=12, dsize=9, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=4] + +} // namespace test9 + +namespace test10 { +struct A { double x; }; +struct B : A {}; + +int a = sizeof(B); + +// CHECK: 0 | struct test10::B +// CHECK-NEXT: 0 | struct test10::A (base) +// CHECK-NEXT: 0 | double x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +} // namespace test10 + +namespace test11 { +// Test how #pragma pack and align attribute interacts with AIX `power` +// alignment rules. 
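+// (In the cases below, #pragma pack(N) caps both the ABI and the preferred
+// alignment at N when N is smaller, while __attribute__((aligned(M))) on a
+// member can raise the ABI alignment as well; compare struct C with struct A.)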
+struct A { + char a; + double __attribute__((aligned(16))) d; + int i; +}; + +struct B { + double __attribute__((aligned(4))) d1; + char a; + double d2; +}; + +#pragma pack(2) +struct C { + int i; + short j; + double k; +}; + +#pragma pack(2) +struct D { + double d; + short j; + int i; +}; + +#pragma pack(8) +struct E { + double __attribute__((aligned(4))) d; + short s; +}; + +#pragma pack(4) +struct F : public D { + double d; +}; + +#pragma pack(2) +struct G : public E { + int i; +}; + +int a = sizeof(A); +int b = sizeof(B); +int c = sizeof(C); +int d = sizeof(D); +int e = sizeof(E); +int f = sizeof(F); +int g = sizeof(G); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::A +// CHECK-NEXT: 0 | char a +// CHECK-NEXT: 16 | double d +// CHECK-NEXT: 24 | int i +// CHECK-NEXT: | [sizeof=32, dsize=32, align=16, preferredalign=16, +// CHECK-NEXT: | nvsize=32, nvalign=16, preferrednvalign=16] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::B +// CHECK-NEXT: 0 | double d1 +// CHECK-NEXT: 8 | char a +// CHECK-NEXT: 12 | double d2 +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::C +// CHECK-NEXT: 0 | int i +// CHECK-NEXT: 4 | short j +// CHECK-NEXT: 6 | double k +// CHECK-NEXT: | [sizeof=14, dsize=14, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=14, nvalign=2, preferrednvalign=2] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::D +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short j +// CHECK-NEXT: 10 | int i +// CHECK-NEXT: | [sizeof=14, dsize=14, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=14, nvalign=2, preferrednvalign=2] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::E +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short s +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::F +// CHECK-NEXT: 0 | struct test11::D (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short j +// CHECK-NEXT: 10 | int i +// CHECK-NEXT: 16 | double d +// CHECK-NEXT: | [sizeof=24, dsize=24, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=24, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test11::G +// CHECK-NEXT: 0 | struct test11::E (base) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | short s +// CHECK-NEXT: 16 | int i +// CHECK-NEXT: | [sizeof=20, dsize=20, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=20, nvalign=2, preferrednvalign=2] + +} // namespace test11 diff --git a/clang/test/Layout/aix-no-unique-address-with-double.cpp b/clang/test/Layout/aix-no-unique-address-with-double.cpp new file mode 100644 index 0000000000000..5188bf128bc50 --- /dev/null +++ b/clang/test/Layout/aix-no-unique-address-with-double.cpp @@ -0,0 +1,158 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck %s + +struct Empty {}; + +struct A { + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct Empty (empty) +// CHECK-NEXT: | [sizeof=1, dsize=1, align=1, preferredalign=1, +// CHECK-NEXT: | nvsize=1, 
nvalign=1, preferrednvalign=1] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct A +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=8] + +struct B { + ~B(); + + Empty emp; + A a; + char c; +}; + +struct B1 { + [[no_unique_address]] B b; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct B +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: | [sizeof=16, dsize=13, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=13, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct B1 +// CHECK-NEXT: 0 | struct B b +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: 13 | char [7] ext +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] + +struct C { + ~C(); + + [[no_unique_address]] Empty emp; + A a; + char c; +}; + +struct C1 { + [[no_unique_address]] C c; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct C +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0 | struct A a +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char c +// CHECK-NEXT: | [sizeof=16, dsize=9, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=9, nvalign=4, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct C1 +// CHECK-NEXT: 0 | struct C c +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0 | struct A a +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: 8 | char c +// CHECK-NEXT: 9 | char [7] ext +// CHECK-NEXT: | [sizeof=16, dsize=16, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=16, nvalign=4, preferrednvalign=8] + +struct D { + ~D(); + + [[no_unique_address]] char notEmp; + A a; + char c; +}; + +struct D1 { + [[no_unique_address]] D d; + char ext[7]; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct D +// CHECK-NEXT: 0 | char notEmp +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: | [sizeof=16, dsize=13, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=13, nvalign=4, preferrednvalign=4] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct D1 +// CHECK-NEXT: 0 | struct D d +// CHECK-NEXT: 0 | char notEmp +// CHECK-NEXT: 4 | struct A a +// CHECK-NEXT: 4 | double d +// CHECK-NEXT: 12 | char c +// CHECK-NEXT: 13 | char [7] ext +// CHECK-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] + +struct E { + [[no_unique_address]] Empty emp; + int : 0; + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct E +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 0:- | int +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=4, +// CHECK-NEXT: | nvsize=8, nvalign=4, preferrednvalign=4] + +struct F { + [[no_unique_address]] Empty emp, emp2; + double d; +}; + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct F +// CHECK-NEXT: 0 | struct Empty emp (empty) +// CHECK-NEXT: 1 | struct Empty emp2 (empty) +// CHECK-NEXT: 0 | double d +// CHECK-NEXT: | [sizeof=8, dsize=8, align=4, preferredalign=8, +// CHECK-NEXT: | nvsize=8, 
nvalign=4, preferrednvalign=8] + +int a = sizeof(Empty); +int b = sizeof(A); +int c = sizeof(B1); +int d = sizeof(C1); +int e = sizeof(D1); +int f = sizeof(E); +int g = sizeof(F); diff --git a/clang/test/Layout/aix-pack-attr-on-base.cpp b/clang/test/Layout/aix-pack-attr-on-base.cpp new file mode 100644 index 0000000000000..3d0ebabf79a82 --- /dev/null +++ b/clang/test/Layout/aix-pack-attr-on-base.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -emit-llvm -triple powerpc-ibm-aix-xcoff -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -emit-llvm -triple powerpc64-ibm-aix-xcoff -x c++ < %s | \ +// RUN: FileCheck %s + +struct A { + char x; +}; + +struct B { + int x; +}; + +struct __attribute__((__packed__)) C : A, B {} c; + +int s = sizeof(c); + +// CHECK: @c = global %struct.C zeroinitializer, align 1 +// CHECK: @s = global i32 5 diff --git a/clang/test/Layout/aix-power-alignment-typedef-2.cpp b/clang/test/Layout/aix-power-alignment-typedef-2.cpp new file mode 100644 index 0000000000000..8e7e3db47c602 --- /dev/null +++ b/clang/test/Layout/aix-power-alignment-typedef-2.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -S -emit-llvm -x c++ < %s | \ +// RUN: FileCheck %s + +struct C { + double x; +}; + +typedef struct C __attribute__((__aligned__(2))) CC; + +CC cc; + +// CHECK: @cc = global %struct.C zeroinitializer, align 2 diff --git a/clang/test/Layout/aix-power-alignment-typedef.cpp b/clang/test/Layout/aix-power-alignment-typedef.cpp new file mode 100644 index 0000000000000..fc973a1fdfd81 --- /dev/null +++ b/clang/test/Layout/aix-power-alignment-typedef.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts %s | \ +// RUN: FileCheck %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts %s | \ +// RUN: FileCheck %s + +namespace test1 { +typedef double __attribute__((__aligned__(2))) Dbl; +struct A { + Dbl x; +}; + +int b = sizeof(A); + +// CHECK: 0 | struct test1::A +// CHECK-NEXT: 0 | test1::Dbl x +// CHECK-NEXT: | [sizeof=8, dsize=8, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=8, nvalign=2, preferrednvalign=2] + +} // namespace test1 + +namespace test2 { +typedef double Dbl __attribute__((__aligned__(2))); +typedef Dbl DblArr[]; + +union U { + DblArr da; + char x; +}; + +int x = sizeof(U); + +// CHECK: 0 | union test2::U +// CHECK-NEXT: 0 | test2::DblArr da +// CHECK-NEXT: 0 | char x +// CHECK-NEXT: | [sizeof=2, dsize=2, align=2, preferredalign=2, +// CHECK-NEXT: | nvsize=2, nvalign=2, preferrednvalign=2] + +} // namespace test2 diff --git a/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp b/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp new file mode 100644 index 0000000000000..d3bc4418db1fe --- /dev/null +++ b/clang/test/Layout/aix-virtual-function-and-base-with-double.cpp @@ -0,0 +1,112 @@ +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK32 %s + +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fdump-record-layouts \ +// RUN: -fsyntax-only %s | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK64 %s + +namespace test1 { +struct A { + double d1; + virtual void boo() {} +}; + +struct B { + double d2; + A a; +}; + +struct C : public A { + double d3; +}; + +int i = sizeof(B); +int j = sizeof(C); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | 
struct test1::A +// CHECK-NEXT: 0 | (A vtable pointer) +// CHECK32-NEXT: 4 | double d1 +// CHECK32-NEXT: | [sizeof=12, dsize=12, align=4, preferredalign=4, +// CHECK32-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +// CHECK64-NEXT: 8 | double d1 +// CHECK64-NEXT: | [sizeof=16, dsize=16, align=8, preferredalign=8, +// CHECK64-NEXT: | nvsize=16, nvalign=8, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::B +// CHECK-NEXT: 0 | double d2 +// CHECK-NEXT: 8 | struct test1::A a +// CHECK-NEXT: 8 | (A vtable pointer) +// CHECK32-NEXT: 12 | double d1 +// CHECK32-NEXT: | [sizeof=24, dsize=20, align=4, preferredalign=8, +// CHECK32-NEXT: | nvsize=20, nvalign=4, preferrednvalign=8] +// CHECK64-NEXT: 16 | double d1 +// CHECK64-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8, +// CHECK64-NEXT: | nvsize=24, nvalign=8, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test1::C +// CHECK-NEXT: 0 | struct test1::A (primary base) +// CHECK-NEXT: 0 | (A vtable pointer) +// CHECK32-NEXT: 4 | double d1 +// CHECK32-NEXT: 12 | double d3 +// CHECK32-NEXT: | [sizeof=20, dsize=20, align=4, preferredalign=4, +// CHECK32-NEXT: | nvsize=20, nvalign=4, preferrednvalign=4] +// CHECK64-NEXT: 8 | double d1 +// CHECK64-NEXT: 16 | double d3 +// CHECK64-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8, +// CHECK64-NEXT: | nvsize=24, nvalign=8, preferrednvalign=8] + +} // namespace test1 + +namespace test2 { +struct A { + long long l1; +}; + +struct B : public virtual A { + double d2; +}; + +#pragma pack(2) +struct C : public virtual A { + double __attribute__((aligned(4))) d3; +}; + +int i = sizeof(B); +int j = sizeof(C); + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test2::A +// CHECK-NEXT: 0 | long long l1 +// CHECK-NEXT: | [sizeof=8, dsize=8, align=8, preferredalign=8, +// CHECK-NEXT: | nvsize=8, nvalign=8, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test2::B +// CHECK-NEXT: 0 | (B vtable pointer) +// CHECK32-NEXT: 4 | double d2 +// CHECK64-NEXT: 8 | double d2 +// CHECK-NEXT: 16 | struct test2::A (virtual base) +// CHECK-NEXT: 16 | long long l1 +// CHECK-NEXT: | [sizeof=24, dsize=24, align=8, preferredalign=8, +// CHECK32-NEXT: | nvsize=12, nvalign=4, preferrednvalign=4] +// CHECK64-NEXT: | nvsize=16, nvalign=8, preferrednvalign=8] + +// CHECK: *** Dumping AST Record Layout +// CHECK-NEXT: 0 | struct test2::C +// CHECK-NEXT: 0 | (C vtable pointer) +// CHECK32-NEXT: 4 | double d3 +// CHECK32-NEXT: 12 | struct test2::A (virtual base) +// CHECK32-NEXT: 12 | long long l1 +// CHECK32-NEXT: | [sizeof=20, dsize=20, align=2, preferredalign=2, +// CHECK32-NEXT: | nvsize=12, nvalign=2, preferrednvalign=2] +// CHECK64-NEXT: 8 | double d3 +// CHECK64-NEXT: 16 | struct test2::A (virtual base) +// CHECK64-NEXT: 16 | long long l1 +// CHECK64-NEXT: | [sizeof=24, dsize=24, align=2, preferredalign=2, +// CHECK64-NEXT: | nvsize=16, nvalign=2, preferrednvalign=2] + +} // namespace test2 diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 3a376a7caab46..bf6eaefe0b3ca 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -79,7 +79,7 @@ // PPC: error: unknown target CPU 'not-a-cpu' // PPC: note: valid target CPU values are: generic, 440, 450, 601, 602, 603, // PPC-SAME: 603e, 603ev, 604, 604e, 620, 630, g3, 7400, g4, 7450, g4+, 750, -// PPC-SAME: 8548, 970, g5, a2, 
a2q, e500, e500mc, e5500, power3, pwr3, power4, +// PPC-SAME: 8548, 970, g5, a2, e500, e500mc, e5500, power3, pwr3, power4, // PPC-SAME: pwr4, power5, pwr5, power5x, pwr5x, power6, pwr6, power6x, pwr6x, // PPC-SAME: power7, pwr7, power8, pwr8, power9, pwr9, power10, pwr10, powerpc, ppc, powerpc64, // PPC-SAME: ppc64, powerpc64le, ppc64le, future diff --git a/clang/test/Modules/module-private.cpp b/clang/test/Modules/module-private.cpp index 30957865d1cd0..a4b3b0fd21d39 100644 --- a/clang/test/Modules/module-private.cpp +++ b/clang/test/Modules/module-private.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_left -emit-module %S/Inputs/module.map -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_right -emit-module %S/Inputs/module.map -// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -I %S/Inputs %s -verify +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_left -emit-module %S/Inputs/module.map +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -fmodule-name=module_private_right -emit-module %S/Inputs/module.map +// RUN: %clang_cc1 -std=c++17 -fmodules -fimplicit-module-maps -x objective-c++ -fmodules-cache-path=%t -I %S/Inputs %s -verify // FIXME: When we have a syntax for modules in C++, use that. @import module_private_left; @@ -79,11 +79,15 @@ __module_private__ struct public_class { }; // expected-error{{partial spec void local_var_private(__module_private__ int param) { // expected-error{{parameter 'param' cannot be declared __module_private__}} __module_private__ struct Local { int x, y; } local; //expected-error{{local variable 'local' cannot be declared __module_private__}} + __module_private__ auto [x, y] = local; // expected-error {{local variable '[x, y]' cannot be declared __module_private__}} + __module_private__ struct OtherLocal { int x; }; // expected-error{{local struct cannot be declared __module_private__}} typedef __module_private__ int local_typedef; // expected-error{{typedef 'local_typedef' cannot be declared __module_private__}} } +void param_private(__module_private__ int) {} // expected-error {{parameter '' cannot be declared __module_private}} + // Check struct size struct LikeVisibleStruct { int field; diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp new file mode 100644 index 0000000000000..0b6f2d40ffe87 --- /dev/null +++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp @@ -0,0 +1,43 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#define N 1000 + +int test_amdgcn_target_tid_threads() { +// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads + + int arr[N]; + +// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0) +// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32 +// CHECK-NEXT: sub nuw i32 [[VAR]], 64 +// 
CHECK: call i32 @llvm.amdgcn.workitem.id.x() +#pragma omp target + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + + return arr[0]; +} + +int test_amdgcn_target_tid_threads_simd() { +// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd + + int arr[N]; + +// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0) +// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32 +// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0) +#pragma omp target simd + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + return arr[0]; +} + +#endif diff --git a/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp b/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp new file mode 100644 index 0000000000000..4ed953a9ebf7b --- /dev/null +++ b/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp @@ -0,0 +1,24 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// expected-no-diagnostics + +#define N 100 + +int test_amdgcn_target_temp_alloca() { + // CHECK-LABEL: test_amdgcn_target_temp_alloca + + int arr[N]; + + // CHECK: [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5) + // CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]** + // CHECK: store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8 + +#pragma omp target + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + + return arr[0]; +} diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp index a21a9db1e39a8..0942c7cf42369 100644 --- a/clang/test/OpenMP/cancel_codegen.cpp +++ b/clang/test/OpenMP/cancel_codegen.cpp @@ -16,7 +16,6 @@ float flag; int main (int argc, char **argv) { -// ALL: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num( #pragma omp parallel { #pragma omp cancel parallel if(flag) @@ -42,14 +41,14 @@ int main (int argc, char **argv) { } } // ALL: call void @__kmpc_for_static_init_4( -// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID]], i32 3) +// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID:%.*]], i32 3) // ALL: [[CMP:%.+]] = icmp ne i32 [[RES]], 0 // ALL: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]] // ALL: [[EXIT]] // ALL: br label // ALL: [[CONTINUE]] // ALL: br label -// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID]], i32 3) +// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID:%.*]], i32 3) // ALL: [[CMP:%.+]] = icmp ne i32 [[RES]], 0 // ALL: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]] // ALL: [[EXIT]] @@ -66,7 +65,7 @@ for (int i = 0; i < argc; ++i) { // ALL: [[BOOL:%.+]] = fcmp une float [[FLAG]], 0.000000e+00 // ALL: br i1 [[BOOL]], label %[[THEN:[^,]+]], label %[[ELSE:[^,]+]] // ALL: [[THEN]] -// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID]], i32 2) +// ALL: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID:%.*]], i32 2) // ALL: [[CMP:%.+]] = icmp ne i32 [[RES]], 0 // ALL: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]] // ALL: [[EXIT]] @@ -148,7 +147,7 @@ for (int i = 0; 
i < argc; ++i) { // CHECK: br label // CHECK: [[CONTINUE]] // CHECK: br label -// CHECK: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID]], i32 3) +// CHECK: [[RES:%.+]] = call i32 @__kmpc_cancel(%struct.ident_t* {{[^,]+}}, i32 [[GTID:%.*]], i32 3) // CHECK: [[CMP:%.+]] = icmp ne i32 [[RES]], 0 // CHECK: br i1 [[CMP]], label %[[EXIT:[^,].+]], label %[[CONTINUE:.+]] // CHECK: [[EXIT]] diff --git a/clang/test/OpenMP/declare_mapper_ast_print.c b/clang/test/OpenMP/declare_mapper_ast_print.c index e82bc4c2d3109..b7c9976c122af 100644 --- a/clang/test/OpenMP/declare_mapper_ast_print.c +++ b/clang/test/OpenMP/declare_mapper_ast_print.c @@ -5,6 +5,14 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -ast-print %s | FileCheck %s // RUN: %clang_cc1 -fopenmp-simd -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s + +// RUN: %clang_cc1 -DOMP51 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s +// RUN: %clang_cc1 -DOMP51 -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -DOMP51 -fopenmp -fopenmp-version=51 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s + +// RUN: %clang_cc1 -DOMP51 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s +// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -fopenmp-version=51 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s // expected-no-diagnostics #ifndef HEADER @@ -50,6 +58,12 @@ int main() { { dd[0].i++; } #pragma omp target update to(mapper(id): vv) from(mapper(default): dd[0:10]) // CHECK: #pragma omp target update to(mapper(id): vv) from(mapper(default): dd[0:10]) +#ifdef OMP51 +#pragma omp target update to(mapper(id) present: vv) from(mapper(default), present: dd[0:10]) +// OMP51: #pragma omp target update to(mapper(id), present: vv) from(mapper(default), present: dd[0:10]) +#pragma omp target update to(present mapper(id): vv) from(present, mapper(default): dd[0:10]) +// OMP51: #pragma omp target update to(present, mapper(id): vv) from(present, mapper(default): dd[0:10]) +#endif } return 0; } diff --git a/clang/test/OpenMP/declare_mapper_codegen.cpp b/clang/test/OpenMP/declare_mapper_codegen.cpp index ee64fe2099007..2fd4b3cb7ed07 100644 --- a/clang/test/OpenMP/declare_mapper_codegen.cpp +++ b/clang/test/OpenMP/declare_mapper_codegen.cpp @@ -839,4 +839,242 @@ void foo(int a){ #endif // CK3 +///==========================================================================/// +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm -femit-all-decls -disable-llvm-passes %s -o - | FileCheck --check-prefix CK4 --check-prefix CK4-64 %s +// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -femit-all-decls -disable-llvm-passes -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -femit-all-decls -disable-llvm-passes -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CK4 --check-prefix CK4-64 %s +// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-version=51 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm -femit-all-decls -disable-llvm-passes %s -o - | FileCheck --check-prefix CK4 --check-prefix CK4-32 %s +// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -femit-all-decls -disable-llvm-passes -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -femit-all-decls -disable-llvm-passes -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix CK4 --check-prefix CK4-32 %s + +// RUN: %clang_cc1 -DCK4 -verify -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm -femit-all-decls -disable-llvm-passes %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK4 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -femit-all-decls -disable-llvm-passes -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -femit-all-decls -disable-llvm-passes -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK4 -verify -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm -femit-all-decls -disable-llvm-passes %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK4 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -femit-all-decls -disable-llvm-passes -o %t %s +// RUN: %clang_cc1 -DCK4 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -femit-all-decls -disable-llvm-passes -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s + +#ifdef CK4 +// Mapper function code generation and runtime interface. 
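+// For reference, the map-type words checked below decompose as follows,
+// assuming the flag values named in the comments that follow:
+//   PRESENT (0x1000 = 4096) | TARGET_PARAM (0x20 = 32) | TO   (0x1) = 0x1021 (4129)
+//   PRESENT (0x1000 = 4096) | TARGET_PARAM (0x20 = 32) | FROM (0x2) = 0x1022 (4130)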
+ +// CK4-64: [[TSIZES:@.+]] = {{.+}}constant [1 x i64] [i64 16] +// CK4-32: [[TSIZES:@.+]] = {{.+}}constant [1 x i64] [i64 8] +// PRESENT=0x1000 | TARGET_PARAM=0x20 | TO=0x1 = 0x1021 +// CK4: [[TTYPES:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x1021]]] + +// CK4-64: [[FSIZES:@.+]] = {{.+}}constant [1 x i64] [i64 16] +// CK4-32: [[FSIZES:@.+]] = {{.+}}constant [1 x i64] [i64 8] +// PRESENT=0x1000 | TARGET_PARAM=0x20 | FROM=0x2 = 0x1022 +// CK4: [[FTYPES:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x1022]]] + +class C { +public: + int a; + double *b; +}; + +#pragma omp declare mapper(id: C s) map(s.a, s.b[0:2]) + +// CK4: define {{.*}}void [[MPRFUNC:@[.]omp_mapper[.].*C[.]id]](i8*{{.*}}, i8*{{.*}}, i8*{{.*}}, i64{{.*}}, i64{{.*}}) +// CK4: store i8* %{{[^,]+}}, i8** [[HANDLEADDR:%[^,]+]] +// CK4: store i8* %{{[^,]+}}, i8** [[BPTRADDR:%[^,]+]] +// CK4: store i8* %{{[^,]+}}, i8** [[VPTRADDR:%[^,]+]] +// CK4: store i64 %{{[^,]+}}, i{{64|32}}* [[SIZEADDR:%[^,]+]] +// CK4: store i64 %{{[^,]+}}, i64* [[TYPEADDR:%[^,]+]] +// CK4-DAG: [[BYTESIZE:%.+]] = load i64, i64* [[SIZEADDR]] +// CK4-64-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 16 +// CK4-32-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 8 +// CK4-DAG: [[TYPE:%.+]] = load i64, i64* [[TYPEADDR]] +// CK4-DAG: [[HANDLE:%.+]] = load i8*, i8** [[HANDLEADDR]] +// CK4-DAG: [[PTRBEGIN:%.+]] = bitcast i8** [[VPTRADDR]] to %class.C** +// CK4-DAG: [[PTREND:%.+]] = getelementptr %class.C*, %class.C** [[PTRBEGIN]], i64 [[SIZE]] +// CK4-DAG: [[BPTR:%.+]] = load i8*, i8** [[BPTRADDR]] +// CK4-DAG: [[BEGIN:%.+]] = load i8*, i8** [[VPTRADDR]] +// CK4: [[ISARRAY:%.+]] = icmp sge i64 [[SIZE]], 1 +// CK4: br i1 [[ISARRAY]], label %[[INITEVALDEL:[^,]+]], label %[[LHEAD:[^,]+]] + +// CK4: [[INITEVALDEL]] +// CK4: [[TYPEDEL:%.+]] = and i64 [[TYPE]], 8 +// CK4: [[ISNOTDEL:%.+]] = icmp eq i64 [[TYPEDEL]], 0 +// CK4: br i1 [[ISNOTDEL]], label %[[INIT:[^,]+]], label %[[LHEAD:[^,]+]] +// CK4: [[INIT]] +// CK4-64-DAG: [[ARRSIZE:%.+]] = mul nuw i64 [[SIZE]], 16 +// CK4-32-DAG: [[ARRSIZE:%.+]] = mul nuw i64 [[SIZE]], 8 +// CK4-DAG: [[ITYPE:%.+]] = and i64 [[TYPE]], -4 +// CK4: call void @__tgt_push_mapper_component(i8* [[HANDLE]], i8* [[BPTR]], i8* [[BEGIN]], i64 [[ARRSIZE]], i64 [[ITYPE]]) +// CK4: br label %[[LHEAD:[^,]+]] + +// CK4: [[LHEAD]] +// CK4: [[ISEMPTY:%.+]] = icmp eq %class.C** [[PTRBEGIN]], [[PTREND]] +// CK4: br i1 [[ISEMPTY]], label %[[DONE:[^,]+]], label %[[LBODY:[^,]+]] +// CK4: [[LBODY]] +// CK4: [[PTR:%.+]] = phi %class.C** [ [[PTRBEGIN]], %[[LHEAD]] ], [ [[PTRNEXT:%.+]], %[[LCORRECT:[^,]+]] ] +// CK4: [[OBJ:%.+]] = load %class.C*, %class.C** [[PTR]] +// CK4-DAG: [[ABEGIN:%.+]] = getelementptr inbounds %class.C, %class.C* [[OBJ]], i32 0, i32 0 +// CK4-DAG: [[BBEGIN:%.+]] = getelementptr inbounds %class.C, %class.C* [[OBJ]], i32 0, i32 1 +// CK4-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds %class.C, %class.C* [[OBJ]], i32 0, i32 1 +// CK4-DAG: [[BARRBEGIN:%.+]] = load double*, double** [[BBEGIN2]] +// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, double* [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK4-DAG: [[BEND:%.+]] = getelementptr double*, double** [[BBEGIN]], i32 1 +// CK4-DAG: [[ABEGINV:%.+]] = bitcast i32* [[ABEGIN]] to i8* +// CK4-DAG: [[BENDV:%.+]] = bitcast double** [[BEND]] to i8* +// CK4-DAG: [[ABEGINI:%.+]] = ptrtoint i8* [[ABEGINV]] to i64 +// CK4-DAG: [[BENDI:%.+]] = ptrtoint i8* [[BENDV]] to i64 +// CK4-DAG: [[CSIZE:%.+]] = sub i64 [[BENDI]], [[ABEGINI]] +// CK4-DAG: [[CUSIZE:%.+]] = sdiv exact i64 [[CSIZE]], ptrtoint (i8* 
getelementptr (i8, i8* null, i32 1) to i64) +// CK4-DAG: [[BPTRADDR0BC:%.+]] = bitcast %class.C* [[OBJ]] to i8* +// CK4-DAG: [[PTRADDR0BC:%.+]] = bitcast i32* [[ABEGIN]] to i8* +// CK4-DAG: [[PRESIZE:%.+]] = call i64 @__tgt_mapper_num_components(i8* [[HANDLE]]) +// CK4-DAG: [[SHIPRESIZE:%.+]] = shl i64 [[PRESIZE]], 48 +// CK4-DAG: br label %[[MEMBER:[^,]+]] +// CK4-DAG: [[MEMBER]] +// CK4-DAG: br i1 true, label %[[LTYPE:[^,]+]], label %[[MEMBERCOM:[^,]+]] +// CK4-DAG: [[MEMBERCOM]] +// CK4-DAG: [[MEMBERCOMTYPE:%.+]] = add nuw i64 32, [[SHIPRESIZE]] +// CK4-DAG: br label %[[LTYPE]] +// CK4-DAG: [[LTYPE]] +// CK4-DAG: [[MEMBERTYPE:%.+]] = phi i64 [ 32, %[[MEMBER]] ], [ [[MEMBERCOMTYPE]], %[[MEMBERCOM]] ] +// CK4-DAG: [[TYPETF:%.+]] = and i64 [[TYPE]], 3 +// CK4-DAG: [[ISALLOC:%.+]] = icmp eq i64 [[TYPETF]], 0 +// CK4-DAG: br i1 [[ISALLOC]], label %[[ALLOC:[^,]+]], label %[[ALLOCELSE:[^,]+]] +// CK4-DAG: [[ALLOC]] +// CK4-DAG: [[ALLOCTYPE:%.+]] = and i64 [[MEMBERTYPE]], -4 +// CK4-DAG: br label %[[TYEND:[^,]+]] +// CK4-DAG: [[ALLOCELSE]] +// CK4-DAG: [[ISTO:%.+]] = icmp eq i64 [[TYPETF]], 1 +// CK4-DAG: br i1 [[ISTO]], label %[[TO:[^,]+]], label %[[TOELSE:[^,]+]] +// CK4-DAG: [[TO]] +// CK4-DAG: [[TOTYPE:%.+]] = and i64 [[MEMBERTYPE]], -3 +// CK4-DAG: br label %[[TYEND]] +// CK4-DAG: [[TOELSE]] +// CK4-DAG: [[ISFROM:%.+]] = icmp eq i64 [[TYPETF]], 2 +// CK4-DAG: br i1 [[ISFROM]], label %[[FROM:[^,]+]], label %[[TYEND]] +// CK4-DAG: [[FROM]] +// CK4-DAG: [[FROMTYPE:%.+]] = and i64 [[MEMBERTYPE]], -2 +// CK4-DAG: br label %[[TYEND]] +// CK4-DAG: [[TYEND]] +// CK4-DAG: [[PHITYPE0:%.+]] = phi i64 [ [[ALLOCTYPE]], %[[ALLOC]] ], [ [[TOTYPE]], %[[TO]] ], [ [[FROMTYPE]], %[[FROM]] ], [ [[MEMBERTYPE]], %[[TOELSE]] ] +// CK4: call void @__tgt_push_mapper_component(i8* [[HANDLE]], i8* [[BPTRADDR0BC]], i8* [[PTRADDR0BC]], i64 [[CUSIZE]], i64 [[PHITYPE0]]) +// CK4-DAG: [[BPTRADDR1BC:%.+]] = bitcast %class.C* [[OBJ]] to i8* +// CK4-DAG: [[PTRADDR1BC:%.+]] = bitcast i32* [[ABEGIN]] to i8* +// CK4-DAG: br label %[[MEMBER:[^,]+]] +// CK4-DAG: [[MEMBER]] +// CK4-DAG: br i1 false, label %[[LTYPE:[^,]+]], label %[[MEMBERCOM:[^,]+]] +// CK4-DAG: [[MEMBERCOM]] +// 281474976710659 == 0x1,000,000,003 +// CK4-DAG: [[MEMBERCOMTYPE:%.+]] = add nuw i64 281474976710659, [[SHIPRESIZE]] +// CK4-DAG: br label %[[LTYPE]] +// CK4-DAG: [[LTYPE]] +// CK4-DAG: [[MEMBERTYPE:%.+]] = phi i64 [ 281474976710659, %[[MEMBER]] ], [ [[MEMBERCOMTYPE]], %[[MEMBERCOM]] ] +// CK4-DAG: [[TYPETF:%.+]] = and i64 [[TYPE]], 3 +// CK4-DAG: [[ISALLOC:%.+]] = icmp eq i64 [[TYPETF]], 0 +// CK4-DAG: br i1 [[ISALLOC]], label %[[ALLOC:[^,]+]], label %[[ALLOCELSE:[^,]+]] +// CK4-DAG: [[ALLOC]] +// CK4-DAG: [[ALLOCTYPE:%.+]] = and i64 [[MEMBERTYPE]], -4 +// CK4-DAG: br label %[[TYEND:[^,]+]] +// CK4-DAG: [[ALLOCELSE]] +// CK4-DAG: [[ISTO:%.+]] = icmp eq i64 [[TYPETF]], 1 +// CK4-DAG: br i1 [[ISTO]], label %[[TO:[^,]+]], label %[[TOELSE:[^,]+]] +// CK4-DAG: [[TO]] +// CK4-DAG: [[TOTYPE:%.+]] = and i64 [[MEMBERTYPE]], -3 +// CK4-DAG: br label %[[TYEND]] +// CK4-DAG: [[TOELSE]] +// CK4-DAG: [[ISFROM:%.+]] = icmp eq i64 [[TYPETF]], 2 +// CK4-DAG: br i1 [[ISFROM]], label %[[FROM:[^,]+]], label %[[TYEND]] +// CK4-DAG: [[FROM]] +// CK4-DAG: [[FROMTYPE:%.+]] = and i64 [[MEMBERTYPE]], -2 +// CK4-DAG: br label %[[TYEND]] +// CK4-DAG: [[TYEND]] +// CK4-DAG: [[TYPE1:%.+]] = phi i64 [ [[ALLOCTYPE]], %[[ALLOC]] ], [ [[TOTYPE]], %[[TO]] ], [ [[FROMTYPE]], %[[FROM]] ], [ [[MEMBERTYPE]], %[[TOELSE]] ] +// CK4: call void @__tgt_push_mapper_component(i8* [[HANDLE]], i8* 
+// CK4-DAG: [[BPTRADDR2BC:%.+]] = bitcast double** [[BBEGIN]] to i8*
+// CK4-DAG: [[PTRADDR2BC:%.+]] = bitcast double* [[BARRBEGINGEP]] to i8*
+// CK4-DAG: br label %[[MEMBER:[^,]+]]
+// CK4-DAG: [[MEMBER]]
+// CK4-DAG: br i1 false, label %[[LTYPE:[^,]+]], label %[[MEMBERCOM:[^,]+]]
+// CK4-DAG: [[MEMBERCOM]]
+// 281474976710675 == 0x1000000000013, i.e. MEMBER_OF_1=0x1000000000000 | PTR_AND_OBJ=0x10 | FROM=0x2 | TO=0x1
+// CK4-DAG: [[MEMBERCOMTYPE:%.+]] = add nuw i64 281474976710675, [[SHIPRESIZE]]
+// CK4-DAG: br label %[[LTYPE]]
+// CK4-DAG: [[LTYPE]]
+// CK4-DAG: [[MEMBERTYPE:%.+]] = phi i64 [ 281474976710675, %[[MEMBER]] ], [ [[MEMBERCOMTYPE]], %[[MEMBERCOM]] ]
+// CK4-DAG: [[TYPETF:%.+]] = and i64 [[TYPE]], 3
+// CK4-DAG: [[ISALLOC:%.+]] = icmp eq i64 [[TYPETF]], 0
+// CK4-DAG: br i1 [[ISALLOC]], label %[[ALLOC:[^,]+]], label %[[ALLOCELSE:[^,]+]]
+// CK4-DAG: [[ALLOC]]
+// CK4-DAG: [[ALLOCTYPE:%.+]] = and i64 [[MEMBERTYPE]], -4
+// CK4-DAG: br label %[[TYEND:[^,]+]]
+// CK4-DAG: [[ALLOCELSE]]
+// CK4-DAG: [[ISTO:%.+]] = icmp eq i64 [[TYPETF]], 1
+// CK4-DAG: br i1 [[ISTO]], label %[[TO:[^,]+]], label %[[TOELSE:[^,]+]]
+// CK4-DAG: [[TO]]
+// CK4-DAG: [[TOTYPE:%.+]] = and i64 [[MEMBERTYPE]], -3
+// CK4-DAG: br label %[[TYEND]]
+// CK4-DAG: [[TOELSE]]
+// CK4-DAG: [[ISFROM:%.+]] = icmp eq i64 [[TYPETF]], 2
+// CK4-DAG: br i1 [[ISFROM]], label %[[FROM:[^,]+]], label %[[TYEND]]
+// CK4-DAG: [[FROM]]
+// CK4-DAG: [[FROMTYPE:%.+]] = and i64 [[MEMBERTYPE]], -2
+// CK4-DAG: br label %[[TYEND]]
+// CK4-DAG: [[TYEND]]
+// CK4-DAG: [[TYPE2:%.+]] = phi i64 [ [[ALLOCTYPE]], %[[ALLOC]] ], [ [[TOTYPE]], %[[TO]] ], [ [[FROMTYPE]], %[[FROM]] ], [ [[MEMBERTYPE]], %[[TOELSE]] ]
+// CK4: call void @__tgt_push_mapper_component(i8* [[HANDLE]], i8* [[BPTRADDR2BC]], i8* [[PTRADDR2BC]], i64 16, i64 [[TYPE2]])
+// CK4: [[PTRNEXT]] = getelementptr %class.C*, %class.C** [[PTR]], i32 1
+// CK4: [[ISDONE:%.+]] = icmp eq %class.C** [[PTRNEXT]], [[PTREND]]
+// CK4: br i1 [[ISDONE]], label %[[LEXIT:[^,]+]], label %[[LBODY]]
+
+// CK4: [[LEXIT]]
+// CK4: [[ISARRAY:%.+]] = icmp sge i64 [[SIZE]], 1
+// CK4: br i1 [[ISARRAY]], label %[[EVALDEL:[^,]+]], label %[[DONE]]
+// CK4: [[EVALDEL]]
+// CK4: [[TYPEDEL:%.+]] = and i64 [[TYPE]], 8
+// CK4: [[ISDEL:%.+]] = icmp ne i64 [[TYPEDEL]], 0
+// CK4: br i1 [[ISDEL]], label %[[DEL:[^,]+]], label %[[DONE]]
+// CK4: [[DEL]]
+// CK4-64-DAG: [[ARRSIZE:%.+]] = mul nuw i64 [[SIZE]], 16
+// CK4-32-DAG: [[ARRSIZE:%.+]] = mul nuw i64 [[SIZE]], 8
+// CK4-DAG: [[DTYPE:%.+]] = and i64 [[TYPE]], -4
+// CK4: call void @__tgt_push_mapper_component(i8* [[HANDLE]], i8* [[BPTR]], i8* [[BEGIN]], i64 [[ARRSIZE]], i64 [[DTYPE]])
+// CK4: br label %[[DONE]]
+// CK4: [[DONE]]
+// CK4: ret void
+
+
+// CK4-LABEL: define {{.*}}void @{{.*}}foo{{.*}}
+void foo(int a){
+  int i = a;
+  C c;
+  c.a = a;
+
+  // CK4-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[TGEPBP:%.+]], i8** [[TGEPP:%.+]], i64* getelementptr {{.+}}[1 x i64]* [[TSIZES]], i32 0, i32 0), {{.+}}getelementptr {{.+}}[1 x i64]* [[TTYPES]]{{.+}}, i8** [[TMPRGEP:%.+]])
+  // CK4-DAG: [[TGEPBP]] = getelementptr inbounds {{.+}}[[TBP:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[TGEPP]] = getelementptr inbounds {{.+}}[[TP:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[TMPRGEP]] = bitcast [1 x i8*]* [[TMPR:%[^,]+]] to i8**
+  // CK4-DAG: [[TBP0:%.+]] = getelementptr inbounds {{.+}}[[TBP]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[TP0:%.+]] = getelementptr inbounds {{.+}}[[TP]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[TMPR1:%.+]] = getelementptr inbounds {{.+}}[[TMPR]], i[[sz]] 0, i[[sz]] 0
+  // CK4-DAG: [[TCBP0:%.+]] = bitcast i8** [[TBP0]] to %class.C**
+  // CK4-DAG: [[TCP0:%.+]] = bitcast i8** [[TP0]] to %class.C**
+  // CK4-DAG: store %class.C* [[VAL:%[^,]+]], %class.C** [[TCBP0]]
+  // CK4-DAG: store %class.C* [[VAL]], %class.C** [[TCP0]]
+  // CK4-DAG: store i8* bitcast (void (i8*, i8*, i8*, i64, i64)* [[MPRFUNC]] to i8*), i8** [[TMPR1]]
+  #pragma omp target update to(present, mapper(id): c)
+
+  // CK4-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[FGEPBP:%.+]], i8** [[FGEPP:%.+]], i64* getelementptr {{.+}}[1 x i64]* [[FSIZES]], i32 0, i32 0), {{.+}}getelementptr {{.+}}[1 x i64]* [[FTYPES]]{{.+}}, i8** [[FMPRGEP:%.+]])
+  // CK4-DAG: [[FGEPBP]] = getelementptr inbounds {{.+}}[[FBP:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[FGEPP]] = getelementptr inbounds {{.+}}[[FP:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[FMPRGEP]] = bitcast [1 x i8*]* [[FMPR:%[^,]+]] to i8**
+  // CK4-DAG: [[FBP0:%.+]] = getelementptr inbounds {{.+}}[[FBP]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[FP0:%.+]] = getelementptr inbounds {{.+}}[[FP]], i{{.+}} 0, i{{.+}} 0
+  // CK4-DAG: [[FMPR1:%.+]] = getelementptr inbounds {{.+}}[[FMPR]], i[[sz]] 0, i[[sz]] 0
+  // CK4-DAG: [[FCBP0:%.+]] = bitcast i8** [[FBP0]] to %class.C**
+  // CK4-DAG: [[FCP0:%.+]] = bitcast i8** [[FP0]] to %class.C**
+  // CK4-DAG: store %class.C* [[VAL]], %class.C** [[FCBP0]]
+  // CK4-DAG: store %class.C* [[VAL]], %class.C** [[FCP0]]
+  // CK4-DAG: store i8* bitcast (void (i8*, i8*, i8*, i64, i64)* [[MPRFUNC]] to i8*), i8** [[FMPR1]]
+  #pragma omp target update from(mapper(id), present: c)
+}
+
+#endif // CK4
+
 #endif // HEADER
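The map-type comments in the CK4 test above all follow one encoding: the low bits carry the TO/FROM/DELETE/PTR_AND_OBJ/TARGET_PARAM/PRESENT flags, and the 16-bit MEMBER_OF index of the parent map entry sits at bit 48. A minimal sketch of that arithmetic follows; the flag values are transcribed from the comments in these tests rather than from the runtime headers, so treat the names and constants as illustrative.

#include <assert.h>
#include <stdint.h>

/* Map-type flag values as quoted in the comments of these tests. */
enum {
  OMP_TGT_TO = 0x1,
  OMP_TGT_FROM = 0x2,
  OMP_TGT_DELETE = 0x8,
  OMP_TGT_PTR_AND_OBJ = 0x10,
  OMP_TGT_TARGET_PARAM = 0x20,
  OMP_TGT_PRESENT = 0x1000,
};

/* MEMBER_OF(n): the 1-based index of the parent map entry, stored at bit 48. */
static uint64_t member_of(uint64_t n) { return n << 48; }

int main(void) {
  /* [[TTYPES]] entry: PRESENT | TARGET_PARAM | TO == 0x1021 */
  assert((OMP_TGT_PRESENT | OMP_TGT_TARGET_PARAM | OMP_TGT_TO) == 0x1021);
  /* c.a member entry: MEMBER_OF_1 | FROM | TO == 281474976710659 */
  assert((member_of(1) | OMP_TGT_FROM | OMP_TGT_TO) == 281474976710659ULL);
  /* c.b member entry: MEMBER_OF_1 | PTR_AND_OBJ | FROM | TO == 281474976710675 */
  assert((member_of(1) | OMP_TGT_PTR_AND_OBJ | OMP_TGT_FROM | OMP_TGT_TO) == 281474976710675ULL);
  return 0;
}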
diff --git a/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c b/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c
new file mode 100644
index 0000000000000..76a3eedeae301
--- /dev/null
+++ b/clang/test/OpenMP/declare_variant_device_isa_codegen_1.c
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=50 | FileCheck %s --check-prefix=GENERIC
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix=GENERIC
+
+// RUN: %clang_cc1 -target-feature +avx512f -verify -fopenmp -x c -triple %itanium_abi_triple -emit-llvm %s -o - -fopenmp-version=50 | FileCheck %s --check-prefix=WITHFEATURE
+// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t -fopenmp-version=50 %s
+// RUN: %clang_cc1 -target-feature +avx512f -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=50 | FileCheck %s --check-prefix=WITHFEATURE
+
+// expected-no-diagnostics
+
+// Test taken from PR46338 (by Linna Su)
+
+#ifndef HEADER
+#define HEADER
+
+void base_saxpy(int, float, float *, float *);
+void avx512_saxpy(int, float, float *, float *);
+
+#pragma omp declare variant(avx512_saxpy) \
+    match(device = {isa(avx512f)})
+void base_saxpy(int n, float s, float *x, float *y) {
+#pragma omp parallel for
+  for (int i = 0; i < n; i++)
+    y[i] = s * x[i] + y[i];
+}
+
+void avx512_saxpy(int n, float s, float *x, float *y) {
+#pragma omp parallel for simd 
simdlen(16) aligned(x, y : 64) + for (int i = 0; i < n; i++) + y[i] = s * x[i] + y[i]; +} + +void caller(int n, float s, float *x, float *y) { + // GENERIC: define {{.*}}void @{{.*}}caller + // GENERIC: call void @{{.*}}base_saxpy + // WITHFEATURE: define {{.*}}void @{{.*}}caller + // WITHFEATURE: call void @{{.*}}avx512_saxpy + base_saxpy(n, s, x, y); +} + +__attribute__((target("avx512f"))) void variant_caller(int n, float s, float *x, float *y) { + // GENERIC: define {{.*}}void @{{.*}}variant_caller + // GENERIC: call void @{{.*}}avx512_saxpy + // WITHFEATURE: define {{.*}}void @{{.*}}variant_caller + // WITHFEATURE: call void @{{.*}}avx512_saxpy + base_saxpy(n, s, x, y); +} + +#endif diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c index ecbf022351a75..84a56c5fd4094 100644 --- a/clang/test/OpenMP/declare_variant_messages.c +++ b/clang/test/OpenMP/declare_variant_messages.c @@ -137,6 +137,18 @@ void marked_variant(void); #pragma omp declare variant(marked_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} void marked(void); +#pragma omp declare variant(foo) match(device = {isa("foo")}) +int unknown_isa_trait(void); +#pragma omp declare variant(foo) match(device = {isa(foo)}) +int unknown_isa_trait2(void); +#pragma omp declare variant(foo) match(device = {kind(fpga), isa(bar)}) +int ignored_isa_trait(void); + +void caller() { + unknown_isa_trait(); // expected-warning {{isa trait 'foo' is not known to the current target; verify the spelling or consider restricting the context selector with the 'arch' selector further}} + unknown_isa_trait2(); // expected-warning {{isa trait 'foo' is not known to the current target; verify the spelling or consider restricting the context selector with the 'arch' selector further}} + ignored_isa_trait(); +} #pragma omp declare variant // expected-error {{function declaration is expected after 'declare variant' directive}} diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c new file mode 100644 index 0000000000000..929a92827689b --- /dev/null +++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c @@ -0,0 +1,299 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-enable-irbuilder -x c++ -emit-llvm %s -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -fopenmp -fopenmp-enable-irbuilder -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -verify %s -emit-llvm -o - | FileCheck --check-prefixes=CHECK-DEBUG %s + +// expected-no-diagnostics + +// TODO: Teach the update script to check new functions too. + +#ifndef HEADER +#define HEADER + +// CHECK-LABEL: @_Z14parallel_for_0v( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) +// CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK: omp_parallel: +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)) +// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +// CHECK: omp.par.outlined.exit: +// CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK: omp.par.exit.split: +// CHECK-NEXT: ret void +// +// CHECK-DEBUG-LABEL: @_Z14parallel_for_0v( +// CHECK-DEBUG-NEXT: entry: +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK-DEBUG: omp_parallel: +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] +// CHECK-DEBUG: omp.par.outlined.exit: +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK-DEBUG: omp.par.exit.split: +// CHECK-DEBUG-NEXT: ret void, !dbg !{{[0-9]*}} +// +void parallel_for_0(void) { +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) { + } + } +} + +// CHECK-LABEL: @_Z14parallel_for_1Pfid( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) +// CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK: omp_parallel: +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) +// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT19:%.*]] +// CHECK: omp.par.outlined.exit19: +// CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK: omp.par.exit.split: +// CHECK-NEXT: ret void +// +// CHECK-DEBUG-LABEL: @_Z14parallel_for_1Pfid( +// CHECK-DEBUG-NEXT: entry: +// CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 +// CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 +// CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @12), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK-DEBUG: omp_parallel: +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @12, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT19:%.*]] +// CHECK-DEBUG: omp.par.outlined.exit19: +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK-DEBUG: omp.par.exit.split: +// CHECK-DEBUG-NEXT: ret void, !dbg !{{[0-9]*}} +// +void parallel_for_1(float *r, int a, double b) { +#pragma omp parallel + { +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) { + *r = a + b; + } + } + } +} + +// CHECK-LABEL: @_Z14parallel_for_2Pfid( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[DOTOMP_IV212:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP213:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB214:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB215:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE216:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I218:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 +// CHECK-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) +// CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK: omp_parallel: +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) +// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT211:%.*]] +// CHECK: omp.par.outlined.exit211: +// CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK: omp.par.exit.split: +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB214]], align 4 +// CHECK-NEXT: store i32 99, i32* [[DOTOMP_UB215]], align 4 +// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE216]], align 4 +// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @41) +// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @2, i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1) +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4 +// CHECK-NEXT: [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99 +// CHECK-NEXT: br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]] +// CHECK: cond.true221: +// CHECK-NEXT: br label [[COND_END223:%.*]] +// CHECK: cond.false222: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4 +// CHECK-NEXT: br label [[COND_END223]] +// CHECK: cond.end223: +// CHECK-NEXT: [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ] +// CHECK-NEXT: store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND225:%.*]] +// CHECK: omp.inner.for.cond225: +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4 +// CHECK-NEXT: [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]] +// CHECK-NEXT: br i1 [[CMP226]], label [[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]] +// CHECK: omp.inner.for.body227: +// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 +// CHECK-NEXT: [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1 +// CHECK-NEXT: [[ADD229:%.*]] = add nsw i32 0, [[MUL228]] +// CHECK-NEXT: store i32 [[ADD229]], i32* [[I218]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double +// CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8 +// CHECK-NEXT: [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]] +// CHECK-NEXT: [[CONV232:%.*]] = fptrunc double [[ADD231]] to float +// CHECK-NEXT: [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8 +// CHECK-NEXT: store float [[CONV232]], float* [[TMP8]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE233:%.*]] +// CHECK: omp.body.continue233: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC234:%.*]] +// CHECK: omp.inner.for.inc234: +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 +// CHECK-NEXT: [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1 +// CHECK-NEXT: store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND225]] +// CHECK: omp.inner.for.end236: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT237:%.*]] +// CHECK: omp.loop.exit237: +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 
@__kmpc_global_thread_num(%struct.ident_t* @43) +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @2, i32 [[OMP_GLOBAL_THREAD_NUM238]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) +// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @7, i32 [[OMP_GLOBAL_THREAD_NUM239]]) +// CHECK-NEXT: ret void +// +// CHECK-DEBUG-LABEL: @_Z14parallel_for_2Pfid( +// CHECK-DEBUG-NEXT: entry: +// CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 +// CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 +// CHECK-DEBUG-NEXT: [[DOTOMP_IV212:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[TMP213:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[DOTOMP_LB214:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[DOTOMP_UB215:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[DOTOMP_STRIDE216:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[I218:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @25), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]] +// CHECK-DEBUG: omp_parallel: +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @25, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT211:%.*]] +// CHECK-DEBUG: omp.par.outlined.exit211: +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] +// CHECK-DEBUG: omp.par.exit.split: +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV212]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB214]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 0, i32* [[DOTOMP_LB214]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB215]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 99, i32* [[DOTOMP_UB215]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE216]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 1, i32* [[DOTOMP_STRIDE216]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST217]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[I218]], metadata !{{[0-9]*}}, metadata !DIExpression()), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @97) +// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @96, i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: cond.true221: +// CHECK-DEBUG-NEXT: br label [[COND_END223:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: cond.false222: +// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[COND_END223]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: cond.end223: +// CHECK-DEBUG-NEXT: [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ], !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_COND225:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: omp.inner.for.cond225: +// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]], !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br i1 [[CMP226]], label 
[[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: omp.inner.for.body227: +// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[ADD229:%.*]] = add nsw i32 0, [[MUL228]], !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[ADD229]], i32* [[I218]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]], !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[CONV232:%.*]] = fptrunc double [[ADD231]] to float, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store float [[CONV232]], float* [[TMP8]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_BODY_CONTINUE233:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: omp.body.continue233: +// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_INC234:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: omp.inner.for.inc234: +// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4, !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_COND225]], !dbg !{{[0-9]*}}, !llvm.loop !{{[0-9]*}} +// CHECK-DEBUG: omp.inner.for.end236: +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_EXIT237:%.*]], !dbg !{{[0-9]*}} +// CHECK-DEBUG: omp.loop.exit237: +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @100) +// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @99, i32 [[OMP_GLOBAL_THREAD_NUM238]]), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @103), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(%struct.ident_t* @102, i32 [[OMP_GLOBAL_THREAD_NUM239]]), !dbg !{{[0-9]*}} +// CHECK-DEBUG-NEXT: ret void, !dbg !{{[0-9]*}} +// +void parallel_for_2(float *r, int a, double b) { +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; + } +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; + } +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; + } +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; + } +#pragma omp for + for (int i = 0; i < 100; ++i) + *r = a + b; +} + +#endif diff --git a/clang/test/OpenMP/target_data_codegen.cpp b/clang/test/OpenMP/target_data_codegen.cpp index 80e674c0b49d9..a2bb8cdf5ba88 100644 --- a/clang/test/OpenMP/target_data_codegen.cpp +++ b/clang/test/OpenMP/target_data_codegen.cpp @@ -555,7 +555,7 @@ struct S2 { void test_close_modifier(int arg) { S2 *ps; - // CK5: private unnamed_addr constant [6 x i64] [i64 1059, i64 32, i64 562949953422339, i64 562949953421328, i64 16, i64 1043] + // 
CK5: private unnamed_addr constant [5 x i64] [i64 1059, i64 32, i64 562949953421328, i64 16, i64 1043] #pragma omp target data map(close,tofrom: arg, ps->ps->ps->ps->s) { ++(arg); @@ -634,20 +634,17 @@ void test_present_modifier(int arg) { // Make sure the struct picks up present even if another element of the struct // doesn't have present. - // CK8: private unnamed_addr constant [15 x i64] + // CK8: private unnamed_addr constant [11 x i64] // ps1 // // PRESENT=0x1000 | TARGET_PARAM=0x20 = 0x1020 // MEMBER_OF_1=0x1000000000000 | FROM=0x2 | TO=0x1 = 0x1000000000003 - // MEMBER_OF_1=0x1000000000000 | PTR_AND_OBJ=0x10 | FROM=0x2 | TO=0x1 = 0x1000000000013 - // MEMBER_OF_1=0x1000000000000 | PRESENT=0x1000 | FROM=0x2 | TO=0x1 = 0x1000000001003 // MEMBER_OF_1=0x1000000000000 | PRESENT=0x1000 | PTR_AND_OBJ=0x10 = 0x1000000001010 // PRESENT=0x1000 | PTR_AND_OBJ=0x10 = 0x1010 // PRESENT=0x1000 | PTR_AND_OBJ=0x10 | FROM=0x2 | TO=0x1 = 0x1013 // // CK8-SAME: {{^}} [i64 [[#0x1020]], i64 [[#0x1000000000003]], - // CK8-SAME: {{^}} i64 [[#0x1000000000013]], i64 [[#0x1000000001003]], // CK8-SAME: {{^}} i64 [[#0x1000000001010]], i64 [[#0x1010]], i64 [[#0x1013]], // arg @@ -659,16 +656,13 @@ void test_present_modifier(int arg) { // ps2 // // PRESENT=0x1000 | TARGET_PARAM=0x20 = 0x1020 - // MEMBER_OF_9=0x9000000000000 | PRESENT=0x1000 | FROM=0x2 | TO=0x1 = 0x9000000001003 - // MEMBER_OF_9=0x9000000000000 | PRESENT=0x1000 | PTR_AND_OBJ=0x10 | FROM=0x2 | TO=0x1 = 0x9000000001013 - // MEMBER_OF_9=0x9000000000000 | FROM=0x2 | TO=0x1 = 0x9000000000003 - // MEMBER_OF_9=0x9000000000000 | PTR_AND_OBJ=0x10 = 0x9000000000010 + // MEMBER_OF_7=0x7000000000000 | PRESENT=0x1000 | FROM=0x2 | TO=0x1 = 0x7000000001003 + // MEMBER_OF_7=0x7000000000000 | PTR_AND_OBJ=0x10 = 0x7000000000010 // PTR_AND_OBJ=0x10 = 0x10 // PTR_AND_OBJ=0x10 | FROM=0x2 | TO=0x1 = 0x13 // - // CK8-SAME: {{^}} i64 [[#0x1020]], i64 [[#0x9000000001003]], - // CK8-SAME: {{^}} i64 [[#0x9000000001013]], i64 [[#0x9000000000003]], - // CK8-SAME: {{^}} i64 [[#0x9000000000010]], i64 [[#0x10]], i64 [[#0x13]]] + // CK8-SAME: {{^}} i64 [[#0x1020]], i64 [[#0x7000000001003]], + // CK8-SAME: {{^}} i64 [[#0x7000000000010]], i64 [[#0x10]], i64 [[#0x13]]] #pragma omp target data map(tofrom: ps1->s) \ map(present,tofrom: arg, ps1->ps->ps->ps->s, ps2->s) \ map(tofrom: ps2->ps->ps->ps->s) diff --git a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp index fe6ea01b43c91..ca5536f927a10 100644 --- a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp @@ -22,18 +22,18 @@ double *g; // CK1: @g = global double* -// CK1: [[MTYPE00:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE01:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE03:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE04:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE05:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE06:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE07:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 19] -// CK1: [[MTYPE08:@.+]] = {{.*}}constant [4 x i64] [i64 99, i64 19, i64 35, i64 19] -// CK1: [[MTYPE09:@.+]] = {{.*}}constant [4 x i64] [i64 99, i64 19, i64 99, i64 19] -// CK1: [[MTYPE10:@.+]] = {{.*}}constant [4 x i64] [i64 99, i64 19, i64 99, i64 19] -// CK1: [[MTYPE11:@.+]] = {{.*}}constant [3 x i64] [i64 96, i64 35, i64 19] -// CK1: [[MTYPE12:@.+]] = 
{{.*}}constant [3 x i64] [i64 96, i64 35, i64 19] +// CK1: [[MTYPE00:@.+]] = {{.*}}constant [2 x i64] [i64 51, i64 96] +// CK1: [[MTYPE01:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE03:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE04:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE05:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE06:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE07:@.+]] = {{.*}}constant [1 x i64] [i64 99] +// CK1: [[MTYPE08:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 35] +// CK1: [[MTYPE09:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 99] +// CK1: [[MTYPE10:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 99] +// CK1: [[MTYPE11:@.+]] = {{.*}}constant [2 x i64] [i64 35, i64 96] +// CK1: [[MTYPE12:@.+]] = {{.*}}constant [2 x i64] [i64 35, i64 96] // CK1-LABEL: @_Z3foo template @@ -42,7 +42,7 @@ void foo(float *&lr, T *&tr) { T *t; // CK1: [[T:%.+]] = load double*, double** [[DECL:@g]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to double** // CK1: store double* [[T]], double** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE00]] @@ -61,7 +61,7 @@ void foo(float *&lr, T *&tr) { ++g; // CK1: [[T1:%.+]] = load float*, float** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to float** // CK1: store float* [[T1]], float** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE01]] @@ -92,7 +92,7 @@ void foo(float *&lr, T *&tr) { ++l; // CK1: [[T1:%.+]] = load float*, float** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to float** // CK1: store float* [[T1]], float** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE03]] @@ -115,7 +115,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[BTHEN]]: // CK1: [[T1:%.+]] = load float*, float** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to float** // CK1: store float* [[T1]], float** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE04]] @@ -152,7 +152,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[T2:%.+]] = load float**, float*** [[DECL:%.+]], // CK1: [[T1:%.+]] = load float*, float** [[T2]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to float** // CK1: store float* [[T1]], float** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE05]] @@ -174,7 +174,7 @@ void foo(float *&lr, T *&tr) { ++lr; // CK1: [[T1:%.+]] = load i32*, i32** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to i32** // CK1: store i32* [[T1]], i32** 
[[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE06]] @@ -194,7 +194,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[T2:%.+]] = load i32**, i32*** [[DECL:%.+]], // CK1: [[T1:%.+]] = load i32*, i32** [[T2]], - // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to i32** // CK1: store i32* [[T1]], i32** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE07]] @@ -216,7 +216,7 @@ void foo(float *&lr, T *&tr) { ++tr; // CK1: [[T1:%.+]] = load float*, float** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* %{{.+}}, i32 0, i32 + // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to float** // CK1: store float* [[T1]], float** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE08]] @@ -280,7 +280,7 @@ void foo(float *&lr, T *&tr) { ++l; ++t; // CK1: [[T1:%.+]] = load i32*, i32** [[DECL:%.+]], - // CK1: [[BP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to i32** // CK1: store i32* [[T1]], i32** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE11]] @@ -300,7 +300,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[T2:%.+]] = load i32**, i32*** [[DECL:%.+]], // CK1: [[T1:%.+]] = load i32*, i32** [[T2]], - // CK1: [[BP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* %{{.+}}, i32 0, i32 0 + // CK1: [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1 // CK1: [[CBP:%.+]] = bitcast i8** [[BP]] to i32** // CK1: store i32* [[T1]], i32** [[CBP]], // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE12]] @@ -348,7 +348,7 @@ void bar(float *&a, int *&b) { // CK2: [[ST:%.+]] = type { double*, double** } // CK2: [[MTYPE00:@.+]] = {{.*}}constant [2 x i64] [i64 32, i64 281474976710739] // CK2: [[MTYPE01:@.+]] = {{.*}}constant [2 x i64] [i64 32, i64 281474976710739] -// CK2: [[MTYPE02:@.+]] = {{.*}}constant [4 x i64] [i64 35, i64 19, i64 32, i64 844424930132048] +// CK2: [[MTYPE02:@.+]] = {{.*}}constant [3 x i64] [i64 35, i64 32, i64 562949953421392] // CK2: [[MTYPE03:@.+]] = {{.*}}constant [3 x i64] [i64 32, i64 281474976710739, i64 281474976710736] template @@ -404,7 +404,7 @@ struct ST { // CK2: getelementptr inbounds double, double* [[TTTT]], i32 1 b++; - // CK2: [[BP:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* %{{.+}}, i32 0, i32 3 + // CK2: [[BP:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* %{{.+}}, i32 0, i32 2 // CK2: [[CBP:%.+]] = bitcast i8** [[BP]] to double*** // CK2: store double** [[RVAL:%.+]], double*** [[CBP]], // CK2: call void @__tgt_target_data_begin{{.+}}[[MTYPE02]] diff --git a/clang/test/OpenMP/target_map_codegen.cpp b/clang/test/OpenMP/target_map_codegen.cpp index e63f19ad73c0a..a394f5b1c3d53 100644 --- a/clang/test/OpenMP/target_map_codegen.cpp +++ b/clang/test/OpenMP/target_map_codegen.cpp @@ -3874,7 +3874,7 @@ int explicit_maps_template_args_and_members(int a){ // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0 // CK22: [[SIZE04:@.+]] = private {{.*}}constant [1 x i64] [i64 20] -// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i64] [i64 35] +// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 
x i64] [i64 51] // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0 // CK22: [[SIZE05:@.+]] = private {{.*}}constant [1 x i64] [i64 4] @@ -3894,7 +3894,7 @@ int explicit_maps_template_args_and_members(int a){ // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0 // CK22: [[SIZE09:@.+]] = private {{.*}}constant [1 x i64] [i64 20] -// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i64] [i64 35] +// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i64] [i64 51] // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0 // CK22: [[SIZE10:@.+]] = private {{.*}}constant [1 x i64] [i64 4] @@ -3914,7 +3914,7 @@ int explicit_maps_template_args_and_members(int a){ // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0 // CK22: [[SIZE14:@.+]] = private {{.*}}constant [1 x i64] [i64 20] -// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i64] [i64 35] +// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i64] [i64 51] int a; int c[100]; @@ -4010,11 +4010,10 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 - // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to i32** + // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to i32*** // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to i32** - // CK22-DAG: store i32* [[RVAR0:%.+]], i32** [[CBP0]] + // CK22-DAG: store i32** @d, i32*** [[CBP0]] // CK22-DAG: store i32* [[SEC0:%.+]], i32** [[CP0]] - // CK22-DAG: [[RVAR0]] = load i32*, i32** @d // CK22-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 2 // CK22-DAG: [[RVAR00]] = load i32*, i32** @d @@ -4093,11 +4092,10 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 - // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[ST]]** + // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[ST]]*** // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to [[ST]]** - // CK22-DAG: store [[ST]]* [[RVAR0:%.+]], [[ST]]** [[CBP0]] + // CK22-DAG: store [[ST]]** @sd, [[ST]]*** [[CBP0]] // CK22-DAG: store [[ST]]* [[SEC0:%.+]], [[ST]]** [[CP0]] - // CK22-DAG: [[RVAR0]] = load [[ST]]*, [[ST]]** @sd // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[RVAR00:%.+]], i{{.+}} 2 // CK22-DAG: [[RVAR00]] = load [[ST]]*, [[ST]]** @sd @@ -4176,11 +4174,10 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 - // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[STT]]** + // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[STT]]*** // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to [[STT]]** - // CK22-DAG: store [[STT]]* [[RVAR0:%.+]], [[STT]]** [[CBP0]] + // CK22-DAG: store [[STT]]** @std, [[STT]]*** [[CBP0]] // CK22-DAG: store [[STT]]* [[SEC0:%.+]], [[STT]]** [[CP0]] - // CK22-DAG: [[RVAR0]] = load [[STT]]*, [[STT]]** @std // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[STT]]* [[RVAR00:%.+]], i{{.+}} 2 // CK22-DAG: [[RVAR00]] = load [[STT]]*, [[STT]]** @std diff --git a/clang/test/OpenMP/target_update_ast_print.cpp 
b/clang/test/OpenMP/target_update_ast_print.cpp
index 0111432fde8f6..745233c430eb1 100644
--- a/clang/test/OpenMP/target_update_ast_print.cpp
+++ b/clang/test/OpenMP/target_update_ast_print.cpp
@@ -5,6 +5,14 @@
 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+
+// RUN: %clang_cc1 -DOMP51 -verify -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
+
+// RUN: %clang_cc1 -DOMP51 -verify -fopenmp-simd -fopenmp-version=51 -ast-print %s | FileCheck -check-prefixes=CHECK,OMP51 %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -DOMP51 -fopenmp-simd -fopenmp-version=51 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck -check-prefixes=CHECK,OMP51 %s
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -25,6 +33,13 @@ T foo(T targ, U uarg) {
 #pragma omp target update to(arr[2][0:1:2])
 #pragma omp target update from(arr[2][0:1:2])
+
+#ifdef OMP51
+#pragma omp target update to(present: arr[2][0:1:2])
+
+#pragma omp target update from(present: arr[2][0:1:2], a)
+#endif
+
 return a + targ + (T)b;
 }
 // CHECK: static T a, *p;
@@ -45,6 +60,8 @@ T foo(T targ, U uarg) {
 // CHECK: int arr[100][100];
 // CHECK-NEXT: #pragma omp target update to(arr[2][0:1:2])
 // CHECK-NEXT: #pragma omp target update from(arr[2][0:1:2])
+// OMP51-NEXT: #pragma omp target update to(present: arr[2][0:1:2])
+// OMP51-NEXT: #pragma omp target update from(present: arr[2][0:1:2], a)
 
 int main(int argc, char **argv) {
   static int a;
@@ -62,6 +79,13 @@ int main(int argc, char **argv) {
 // CHECK-NEXT: #pragma omp target update to(argv[2][0:1:2])
 #pragma omp target update from(argv[2][0:1:2])
 // CHECK-NEXT: #pragma omp target update from(argv[2][0:1:2])
+#ifdef OMP51
+#pragma omp target update to(present: argv[2][0:1:2])
+// OMP51-NEXT: #pragma omp target update to(present: argv[2][0:1:2])
+#pragma omp target update from(present: argv[2][0:1:2], a)
+// OMP51-NEXT: #pragma omp target update from(present: argv[2][0:1:2], a)
+#endif
+
 return foo(argc, f) + foo(argv[0][0], f) + a;
 }
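The OMP51 checks above exercise the OpenMP 5.1 `present` motion modifier, which requires the list item to already be present in the device data environment when the update executes; otherwise the runtime reports an error. A minimal user-level sketch of that contract, assuming an OpenMP 5.1 compiler and an arbitrary array size N:

#define N 128

void update_with_present(int *x) {
  /* Map x first, so the present modifier below is satisfied. */
  #pragma omp target enter data map(alloc: x[0:N])

  /* Host-to-device copy; a runtime error if x[0:N] were not mapped. */
  #pragma omp target update to(present: x[0:N])

  /* Device-to-host copy with the same presence requirement. */
  #pragma omp target update from(present: x[0:N])

  #pragma omp target exit data map(delete: x[0:N])
}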
diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp
index 637b580d754f2..9eab8b3367c4e 100644
--- a/clang/test/OpenMP/target_update_codegen.cpp
+++ b/clang/test/OpenMP/target_update_codegen.cpp
@@ -310,22 +310,23 @@ void device_side_scan(int arg) {
 
 #ifdef CK5
 
-// CK5: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 4]
-// CK5: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17]
+// CK5: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
+// CK5: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33]
 
 // CK5-LABEL: lvalue
 void lvalue(int *B, int l, int e) {
 
-  // CK5-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null)
+  // CK5-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null)
   // CK5-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
   // CK5-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
 
-  // CK5-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
-  // CK5-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
-  // CK5-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32***
+  // CK5-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK5-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK5-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32**
   // CK5-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to i32**
-  // CK5-DAG: store i32** [[B_ADDR:%.+]], i32*** [[BPC0]]
+  // CK5-DAG: store i32* [[B_VAL:%.+]], i32** [[BPC0]]
   // CK5-DAG: store i32* [[B_VAL_2:%.+]], i32** [[PC0]]
+  // CK5-DAG: [[B_VAL]] = load i32*, i32** [[B_ADDR:%.+]]
   // CK5-DAG: [[B_VAL_2]] = load i32*, i32** [[B_ADDR]]
   #pragma omp target update to(*B)
   *B += e;
@@ -351,28 +352,29 @@ void lvalue(int *B, int l, int e) {
 
 #ifdef CK6
 
-// CK6: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 4]
-// CK6: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17]
+// CK6: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
+// CK6: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33]
 
 // CK6-LABEL: lvalue
 void lvalue(int *B, int l, int e) {
 
-  // CK6-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null)
+  // CK6-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null)
   // CK6-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
   // CK6-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
 
-  // CK6-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
-  // CK6-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
-  // CK6-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32***
+  // CK6-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK6-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK6-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32**
   // CK6-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to i32**
-  // CK6-DAG: store i32** [[B_ADDR:%.+]], i32*** [[BPC0]]
+  // CK6-DAG: store i32* [[TWO:%.+]], i32** [[BPC0]]
   // CK6-DAG: store i32* [[ADD_PTR:%.+]], i32** [[PC0]]
   // CK6-64-DAG: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[ONE:%.+]], i{{32|64}} [[IDX_EXT:%.+]]
   // CK6-32-DAG: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[ONE:%.+]], i{{32|64}} [[L_VAL:%.+]]
   // CK6-64-DAG: [[IDX_EXT]] = sext i32 [[L_VAL:%.+]] to i64
   // CK6-DAG: [[L_VAL]] = load i32, i32* [[L_ADDR:%.+]]
   // CK6-DAG: store i32 {{.+}}, i32* [[L_ADDR]]
-  // CK6-DAG: [[ONE]] = load i32*, i32** [[B_ADDR]]
+  // CK6-DAG: [[ONE]] = load i32*, i32** [[B_ADDR:%.+]]
+  // CK6-DAG: [[TWO]] = load i32*, i32** [[B_ADDR]]
   #pragma omp target update to(*(B+l))
   *(B+l) += e;
   #pragma omp target update from(*(B+l))
@@ -397,25 +399,26 @@ void lvalue(int *B, int l, int e) {
 
 #ifdef CK7
 
-// CK7: [[SIZE00:@.+]] = 
{{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 4] -// CK7: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17] +// CK7: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4] +// CK7: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33] // CK7-LABEL: lvalue void lvalue(int *B, int l, int e) { - // CK7-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK7-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK7-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK7-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - // CK7-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK7-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK7-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32*** + // CK7-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK7-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK7-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32** // CK7-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to i32** - // CK7-DAG: store i32** [[B_ADDR:%.+]], i32*** [[BPC0]] + // CK7-DAG: store i32* [[B_VAL:%.+]], i32** [[BPC0]] // CK7-DAG: store i32* [[ARRAY_IDX:%.+]], i32** [[PC0]] // CK7-DAG: [[ARRAY_IDX]] = getelementptr inbounds i32, i32* [[ADD_PTR:%.+]], i{{32|64}} [[IDX_PROM:%.+]] // CK7-64-DAG: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[ONE:%.+]], i64 [[IDX_EXT:%.+]] // CK7-32-DAG: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[B_VAL_2:%.+]], i32 [[L_VAL:%.+]] + // CK7-32-DAG: [[B_VAL]] = load i32*, i32** [[B_ADDR:%.+]] // CK7-32-DAG: [[B_VAL_2]] = load i32*, i32** [[B_ADDR]] // CK7-32-DAG: [[L_VAL]] = load i32, i32* [[L_ADDR:%.+]] // CK7-32-DAG: [[IDX_PROM]] = load i32, i32* [[L_ADDR]] @@ -446,18 +449,18 @@ void lvalue(int *B, int l, int e) { // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifdef CK8 -// CK8: [[SIZE00:@.+]] = {{.+}}constant [3 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} {{8|4}}, i{{64|32}} 4] -// CK8: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 33, i64 16, i64 17] +// CK8: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 4] +// CK8: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17] // CK8-LABEL: lvalue void lvalue(int **B, int l, int e) { - // CK8-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}], [3 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}, [3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK8-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}], [2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}, [2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK8-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK8-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - // CK8-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK8-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 + // CK8-DAG: [[BP0:%.+]] = getelementptr inbounds 
{{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 + // CK8-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 // CK8-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32*** // CK8-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to i32** // CK8-DAG: store i32** [[ARRAY_IDX_1:%.+]], i32*** [[BPC0]] @@ -501,19 +504,19 @@ struct S { double *p; }; -// CK9: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 32, i64 281474976710657, i64 281474976710673] +// CK9: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 32, i64 281474976710673] // CK9-LABEL: lvalue void lvalue(struct S *s, int l, int e) { - // CK9-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}, [3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK9-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}, [2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK9-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK9-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK9-DAG: [[GSIZE]] = getelementptr inbounds {{.+}}[[SIZE:%[^,]+]] // - // CK9-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK9-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 - // CK9-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 + // CK9-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 + // CK9-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 + // CK9-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 // CK9-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to double*** // CK9-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to double** // CK9-DAG: store double** [[P:%.+]], double*** [[BPC0]] @@ -551,19 +554,19 @@ struct S { double *p; }; -// CK10: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 32, i64 281474976710657, i64 281474976710673] +// CK10: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 32, i64 281474976710673] // CK10-LABEL: lvalue void lvalue(struct S *s, int l, int e) { - // CK10-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}, [3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK10-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}, [2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK10-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK10-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK10-DAG: [[GSIZE]] = getelementptr inbounds {{.+}}[[SIZE:%[^,]+]] // - // CK10-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK10-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 - // CK10-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 + // CK10-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 + // CK10-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 + // CK10-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 // CK10-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to double*** // CK10-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to double** // CK10-DAG: store double** 
[[P_VAL:%.+]], double*** [[BPC0]] @@ -601,19 +604,19 @@ void lvalue(struct S *s, int l, int e) { struct S { double *p; }; -// CK11: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 32, i64 281474976710657, i64 281474976710673] +// CK11: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 32, i64 281474976710673] // CK11-LABEL: lvalue void lvalue(struct S *s, int l, int e) { - // CK11-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}, [3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK11-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}, [2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK11-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK11-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK11-DAG: [[GSIZE]] = getelementptr inbounds {{.+}}[[SIZE:%[^,]+]] // - // CK11-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK11-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 - // CK11-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 + // CK11-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 + // CK11-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 + // CK11-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 // CK11-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to double*** // CK11-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to double** // CK11-DAG: store double** [[P:%.+]], double*** [[BPC0]] @@ -653,41 +656,44 @@ struct S { double *p; struct S *sp; }; -// CK12: [[MTYPE00:@.+]] = {{.+}}constant [4 x i64] [i64 32, i64 281474976710657, i64 281474976710672, i64 17] +// CK12: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 32, i64 281474976710672, i64 17] // CK12-LABEL: lvalue void lvalue(struct S *s, int l, int e) { - // CK12-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}, [4 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK12-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i{{.+}} [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}, [3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK12-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK12-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK12-DAG: [[GSIZE]] = getelementptr inbounds {{.+}}[[SIZE:%[^,]+]] // - // CK12-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3 - // CK12-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3 - // CK12-DAG: [[SIZE2:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 3 + // CK12-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 + // CK12-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 + // CK12-DAG: [[SIZE2:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 // CK12-DAG: [[BPC2:%.+]] = bitcast i8** [[BP2]] to double*** // CK12-DAG: [[PC2:%.+]] = bitcast i8** [[P2]] to double** // CK12-DAG: store double** [[P_VAL:%.+]], double*** [[BPC2]] // CK12-DAG: store double* [[SIX:%.+]], double** [[PC2]] // CK12-DAG: store i{{.+}} 8, i{{.+}}* [[SIZE2]] - // CK12-DAG: [[BP1:%.+]] = 
getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK12-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 - // CK12-DAG: [[SIZE1:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 + // CK12-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 + // CK12-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 + // CK12-DAG: [[SIZE1:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 // CK12-DAG: [[BPC1:%.+]] = bitcast i8** [[BP1]] to [[STRUCT_S:%.+]]*** // CK12-DAG: [[PC1:%.+]] = bitcast i8** [[P1]] to double*** // CK12-DAG: store [[STRUCT_S]]** [[SP:%.+]], [[STRUCT_S]]*** [[BPC1]] // CK12-DAG: store double** [[P_VAL:%.+]], double*** [[PC1]] // CK12-DAG: store i{{.+}} {{4|8}}, i{{.+}}* [[SIZE1]] - // CK12-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK12-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK12-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 + // CK12-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK12-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK12-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 0 // CK12-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to [[STRUCT_S:%.+]]** // CK12-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to [[STRUCT_S]]*** + // CK12-DAG: store [[STRUCT_S]]* [[ZERO:%.+]], [[STRUCT_S]]** [[BPC0]] + // CK12-DAG: store [[STRUCT_S]]** [[SP]], [[STRUCT_S]]*** [[PC0]] // CK12-DAG: store [[STRUCT_S]]** [[S:%.+]], [[STRUCT_S]]*** [[S_VAL:%.+]] // CK12-DAG: store i{{.+}} {{.+}}, i{{.+}}* [[SIZE0]] // CK12-DAG: [[SP]] = getelementptr inbounds [[STRUCT_S]], [[STRUCT_S]]* [[ONE:%.+]], i32 0, i32 1 - // CK12-DAG: [[ONE]] = load %struct.S*, %struct.S** [[S]], + // CK12-DAG: [[ONE]] = load [[STRUCT_S]]*, [[STRUCT_S]]** [[S:%.+]], + // CK12-DAG: [[ZERO]] = load [[STRUCT_S]]*, [[STRUCT_S]]** [[S]], #pragma omp target update to(*(s->sp->p)) *(s->sp->p) = e; #pragma omp target update from(*(s->sp->p)) @@ -711,21 +717,21 @@ void lvalue(struct S *s, int l, int e) { // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifdef CK13 -// CK13: [[SIZE00:@.+]] = {{.+}}constant [2 x i64] [i64 {{8|4}}, i64 4] -// CK13: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17] +// CK13: [[SIZE00:@.+]] = {{.+}}constant [1 x i64] [i64 4] +// CK13: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33] // CK13-LABEL: lvalue void lvalue(int **BB, int a, int b) { - // CK13-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK13-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK13-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK13-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - // CK13-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK13-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK13-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32**** + // CK13-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK13-DAG: [[P0:%.+]] = 
getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK13-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to i32*** // CK13-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to i32** - // CK13-DAG: store i32*** [[BB_ADDR:%.+]], i32**** [[BPC0]] + // CK13-DAG: store i32** [[B_VAL1:%.+]], i32*** [[BPC0]] // CK13-DAG: store i32* [[ADD_PTR_2:%.+]], i32** [[PC0]] // CK13-64-DAG: [[ADD_PTR_2]] = getelementptr inbounds i32, i32* [[RESULT:%.+]], i64 [[IDX_EXT_1:%.+]] // CK13-32-DAG: [[ADD_PTR_2]] = getelementptr inbounds i32, i32* [[RESULT:%.+]], i32 [[B_ADDR:%.+]] @@ -734,7 +740,8 @@ void lvalue(int **BB, int a, int b) { // CK13-64-DAG: [[ADD_PTR]] = getelementptr inbounds i32*, i32** [[B_VAL:%.+]], i64 [[IDX_EXT:%.+]] // CK13-32-DAG: [[ADD_PTR]] = getelementptr inbounds i32*, i32** [[B_VAL:%.+]], i32 [[A_ADDR:%.+]] // CK13-64-DAG: [[IDX_EXT]] = sext i32 [[TWO:%.+]] to i64 - // CK13-DAG: [[B_VAL]] = load i32**, i32*** [[BB_ADDR]] + // CK13-DAG: [[B_VAL]] = load i32**, i32*** [[BB_ADDR:%.+]] + // CK13-DAG: [[B_VAL1]] = load i32**, i32*** [[BB_ADDR]] #pragma omp target update to(*(*(BB+a)+b)) *(*(BB+a)+b) = 1; #pragma omp target update from(*(*(BB+a)+b)) @@ -831,7 +838,7 @@ void lvalue_member(SSA *sap) { // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifdef CK15 -// CK15: [[MTYPE00:@.+]] = {{.+}}constant [3 x i64] [i64 32, i64 281474976710657, i64 281474976710673] +// CK15: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 32, i64 281474976710673] struct SSA { double *p; @@ -842,36 +849,27 @@ struct SSA { //CK-15-LABEL: lvalue_member void lvalue_member(SSA *sap) { - // CK15-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK15-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GSIZE:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK15-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK15-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK15-DAG: [[GSIZE]] = getelementptr inbounds {{.+}}[[SIZE:%[^,]+]] - // CK15-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2 - // CK15-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 - // CK15-DAG: [[SIZE2:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 2 - // CK15-DAG: [[BPC2:%.+]] = bitcast i8** [[BP2]] to double*** - // CK15-DAG: [[PC2:%.+]] = bitcast i8** [[P2]] to double** - // CK15-DAG: store double** [[P_VAL:%.+]], double*** [[BPC2]] - // CK15-DAG: store double* [[ADD_PTR:%.+]], double** [[PC2]] - // CK15-DAG: store i64 8, i64* [[SIZE2]] - // CK15-DAG: [[ADD_PTR]] = getelementptr inbounds double, double* [[THREE:%.+]], i{{.+}} 3 - // CK15-DAG: [[THREE]] = load double*, double** [[P_VAL_1:%.+]] - // CK15-DAG: [[P_VAL]] = getelementptr inbounds [[SSA:%.+]], [[SSA:%.+]]* [[THIS:%.+]], i32 0, i32 0 // CK15-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 // CK15-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 // CK15-DAG: [[SIZE1:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 1 - // CK15-DAG: [[BPC1:%.+]] = bitcast i8** [[BP1]] to [[SSA]]** - // CK15-DAG: [[PC1:%.+]] = bitcast i8** [[P1]] to [[SSA]]*** - // CK15-DAG: store [[SSA]]* [[SAP_VAL:%.+]], [[SSA]]** [[BPC1]], - // CK15-DAG: store [[SSA]]** [[SAP_ADDR:%.+]], [[SSA]]*** [[PC1]] - // CK15-DAG: store i{{.+}} {{8|4}}, 
i{{.+}}* [[SIZE1]] - // CK15-DAG: [[SAP_VAL]] = load [[SSA]]*, [[SSA]]** [[SAP_ADDR]], + // CK15-DAG: [[BPC1:%.+]] = bitcast i8** [[BP1]] to double*** + // CK15-DAG: [[PC1:%.+]] = bitcast i8** [[P1]] to double** + // CK15-DAG: store double** [[P_VAL:%.+]], double*** [[BPC1]] + // CK15-DAG: store double* [[ADD_PTR:%.+]], double** [[PC1]] + // CK15-DAG: store i64 {{4|8}}, i64* [[SIZE1]] + // CK15-DAG: [[ADD_PTR]] = getelementptr inbounds double, double* [[THREE:%.+]], i{{.+}} 3 + // CK15-DAG: [[THREE]] = load double*, double** [[P_VAL_1:%.+]] + // CK15-DAG: [[P_VAL]] = getelementptr inbounds [[SSA:%.+]], [[SSA:%.+]]* [[THIS:%.+]], i32 0, i32 0 // CK15-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK15-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK15-DAG: [[SIZE0:%.+]] = getelementptr inbounds {{.+}}[[SIZE]], i{{.+}} 0, i{{.+}} 0 - // CK15-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to [[SSA]]*** - // CK15-DAG: store [[SSA]]** [[SAP_ADDR]], [[SSA]]*** [[BPC0]], + // CK15-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to [[SSA]]** + // CK15-DAG: store [[SSA]]* [[ZERO:%.+]], [[SSA]]** [[BPC0]], // CK15-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to double*** // CK15-DAG: store double** [[P_VAL]], double*** [[PC0]], // CK15-DAG: store i{{.+}} [[COMPUTE_SIZE:%.+]], i{{.+}}* [[SIZE0]] @@ -881,6 +879,7 @@ void lvalue_member(SSA *sap) { // CK15-DAG: [[EIGHT]] = ptrtoint i8* [[FIVE:%.+]] to i64 // CK15-DAG: [[SIX]] = bitcast double** {{.+}} to i8* // CK15-DAG: [[FIVE]] = bitcast double** {{.+}} to i8* + // CK15-DAG: [[ZERO]] = load [[SSA]]*, [[SSA]]** %{{.+}}, #pragma omp target update to(*(3+sap->p)) *(3+sap->p) = 1; #pragma omp target update from(*(3+sap->p)) @@ -904,25 +903,26 @@ void lvalue_member(SSA *sap) { // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifdef CK16 -// CK16: [[SIZE00:@.+]] = {{.+}}constant [2 x i64] [i64 {{8|4}}, i64 4] -// CK16: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17] +// CK16: [[SIZE00:@.+]] = {{.+}}constant [1 x i64] [i64 4] +// CK16: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33] //CK16-LABEL: lvalue_find_base void lvalue_find_base(float *f, int *i) { - // CK16-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK16-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK16-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK16-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - // CK16-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK16-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK16-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float*** + // CK16-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK16-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK16-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** // CK16-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** - // CK16-DAG: store float** [[F_ADDR:%.+]], float*** [[BPC0]] + // CK16-DAG: store float* [[F:%.+]], float** [[BPC0]] // CK16-DAG: store float* [[ADD_PTR:%.+]], float** [[PC0]] // CK16-32-DAG: [[ADD_PTR]] = getelementptr inbounds float, float* 
[[THREE:%.+]], i32 [[I:%.+]] // CK16-64-DAG: [[ADD_PTR]] = getelementptr inbounds float, float* [[THREE:%.+]], i64 [[IDX_EXT:%.+]] - // CK16-DAG: [[THREE]] = load float*, float** [[F_ADDR]], + // CK16-DAG: [[THREE]] = load float*, float** [[F_ADDR:%.+]], + // CK16-DAG: [[F]] = load float*, float** [[F_ADDR]], // CK16-64-DAG: [[IDX_EXT]] = sext i32 [[I:%.+]] to i64 #pragma omp target update to(*(*i+f)) @@ -948,8 +948,8 @@ void lvalue_find_base(float *f, int *i) { // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifdef CK17 -// CK17: [[SIZE00:@.+]] = {{.+}}constant [2 x i64] [i64 {{4|8}}, i64 4] -// CK17: [[MTYPE00:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 17] +// CK17: [[SIZE00:@.+]] = {{.+}}constant [1 x i64] [i64 4] +// CK17: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 33] struct SSA { int i; @@ -959,15 +959,15 @@ struct SSA { //CK17-LABEL: lvalue_find_base void lvalue_find_base(float **f, SSA *sa) { - // CK17-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK17-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) // CK17-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK17-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] - // CK17-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK17-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK17-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float**** + // CK17-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK17-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK17-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float*** // CK17-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** - // CK17-DAG: store float*** [[F_ADDR:%.+]], float**** [[BPC0]], + // CK17-DAG: store float** [[F_VAL:%.+]], float*** [[BPC0]], // CK17-DAG: store float* [[ADD_PTR_4:%.+]], float** [[PC0]], // CK17-64-DAG: [[ADD_PTR_4]] = getelementptr inbounds float, float* [[SEVEN:%.+]], i64 [[IDX_EXT_3:%.+]] // CK17-64-DAG: [[IDX_EXT_3]] = sext i32 [[I_VAL:%.+]] to i64 @@ -981,6 +981,8 @@ void lvalue_find_base(float **f, SSA *sa) { // CK17-DAG: [[FIVE]] = load i32, i32* [[I_2:%.+]], // CK17-DAG: [[I_2]] = getelementptr inbounds [[SSA:%.+]], [[SSA]]* [[FOUR:%.+]], i32 0, i32 0 // CK17-DAG: [[FOUR]] = load [[SSA]]*, [[SSA]]** [[SSA_ADDR:%.+]], + // CK17-DAG: [[F]] = load float**, float*** [[F_ADDR:%.+]], + // CK17-DAG: [[F_VAL]] = load float**, float*** [[F_ADDR]], #pragma omp target update to(*(sa->sa->i+*(1+sa->i+f))) *(sa->sa->i+*(1+sa->i+f)) = 1; @@ -1005,13 +1007,13 @@ void lvalue_find_base(float **f, SSA *sa) { // SIMD-ONLY18-NOT: {{__kmpc|__tgt}} #ifdef CK18 -// CK18-DAG: [[MTYPE_TO:@.+]] = {{.+}}constant [2 x i64] [i64 33, i64 16] -// CK18-DAG: [[MTYPE_FROM:@.+]] = {{.+}}constant [2 x i64] [i64 34, i64 16] +// CK18-DAG: [[MTYPE_TO:@.+]] = {{.+}}constant [1 x i64] [i64 33] +// CK18-DAG: [[MTYPE_FROM:@.+]] = {{.+}}constant [1 x i64] [i64 34] //CK18-LABEL: array_shaping void array_shaping(float *f, int sa) { - // CK18-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* 
[[MTYPE_TO]]{{.+}}, i8** null) + // CK18-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_TO]]{{.+}}, i8** null) // CK18-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK18-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK18-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] @@ -1021,23 +1023,12 @@ void array_shaping(float *f, int sa) { // CK18-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 // CK18-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** - // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float*** + // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** // CK18-DAG: store float* [[F1:%.+]], float** [[BPC0]], - // CK18-DAG: store float** [[F_ADDR:%.+]], float*** [[PC0]], - // CK18-DAG: store i64 {{8|4}}, i64* [[S0]], - // CK18-DAG: [[F1]] = load float*, float** [[F_ADDR]], - - // CK18-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK18-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK18-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1 - - // CK18-DAG: [[BPC1:%.+]] = bitcast i8** [[BP1]] to float*** - // CK18-DAG: [[PC1:%.+]] = bitcast i8** [[P1]] to float** - - // CK18-DAG: store float** [[F_ADDR]], float*** [[BPC1]], - // CK18-DAG: store float* [[F2:%.+]], float** [[PC1]], - // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S1]], + // CK18-DAG: store float* [[F2:%.+]], float** [[PC0]], + // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], + // CK18-DAG: [[F1]] = load float*, float** [[F_ADDR:%.+]], // CK18-DAG: [[F2]] = load float*, float** [[F_ADDR]], // CK18-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 4 @@ -1047,7 +1038,7 @@ void array_shaping(float *f, int sa) { // CK18-32-DAG: [[SZ2]] = mul nuw i32 12, %{{.+}} #pragma omp target update to(([3][sa][4])f) sa = 1; - // CK18-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE_FROM]]{{.+}}, i8** null) + // CK18-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE_FROM]]{{.+}}, i8** null) // CK18-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] // CK18-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] // CK18-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] @@ -1057,23 +1048,12 @@ void array_shaping(float *f, int sa) { // CK18-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 // CK18-DAG: [[BPC0:%.+]] = bitcast i8** [[BP0]] to float** - // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float*** + // CK18-DAG: [[PC0:%.+]] = bitcast i8** [[P0]] to float** // CK18-DAG: store float* [[F1:%.+]], float** [[BPC0]], - // CK18-DAG: store float** [[F_ADDR:%.+]], float*** [[PC0]], - // CK18-DAG: store i64 {{8|4}}, i64* [[S0]], + // CK18-DAG: store float* [[F2:%.+]], float** [[PC0]], + // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S0]], // CK18-DAG: [[F1]] = load float*, float** [[F_ADDR]], - - // CK18-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1 - // CK18-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1 - // CK18-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1 - - // CK18-DAG: [[BPC1:%.+]] = bitcast i8** 
[[BP1]] to float*** - // CK18-DAG: [[PC1:%.+]] = bitcast i8** [[P1]] to float** - - // CK18-DAG: store float** [[F_ADDR]], float*** [[BPC1]], - // CK18-DAG: store float* [[F2:%.+]], float** [[PC1]], - // CK18-DAG: store i64 [[SIZE:%.+]], i64* [[S1]], // CK18-DAG: [[F2]] = load float*, float** [[F_ADDR]], // CK18-64-DAG: [[SIZE]] = mul nuw i64 [[SZ1:%.+]], 5 @@ -1084,5 +1064,68 @@ void array_shaping(float *f, int sa) { #pragma omp target update from(([sa][5])f) } +#endif +///==========================================================================/// +// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-64 +// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-64 +// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-32 +// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-32 + +// RUN: %clang_cc1 -DCK19 -verify -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK19 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK19 -verify -fopenmp-version=51 -fopenmp-simd -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -DCK19 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +#ifdef CK19 + +// PRESENT=0x1000 | TARGET_PARAM=0x20 | TO=0x1 = 0x1021 +// CK19: [[MTYPE00:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x1021]]] + +// PRESENT=0x1000 | TARGET_PARAM=0x20 | FROM=0x2 = 0x1022 +// CK19: [[MTYPE01:@.+]] = {{.+}}constant [1 x i64] [i64 [[#0x1022]]] + +// CK19-LABEL: _Z13check_presenti +void check_present(int arg) { + int la; + float lb[arg]; + + // Region 00 + // CK19-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], 
i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}}, i8** null) + // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to float** + // CK19-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to float** + // CK19-DAG: store float* [[VAL0:%[^,]+]], float** [[CBP0]] + // CK19-DAG: store float* [[VAL0]], float** [[CP0]] + // CK19-DAG: store i64 [[CSVAL0:%[^,]+]], i64* [[S0]] + #pragma omp target update to(present: lb) + ; + + // Region 01 + // CK19-DAG: call void @__tgt_target_data_update_mapper(i64 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i64* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}}, i8** null) + // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]] + // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]] + // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]] + + // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0 + // CK19-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to float** + // CK19-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to float** + // CK19-DAG: store float* [[VAL0:%[^,]+]], float** [[CBP0]] + // CK19-DAG: store float* [[VAL0]], float** [[CP0]] + // CK19-DAG: store i64 [[CSVAL0:%[^,]+]], i64* [[S0]] + #pragma omp target update from(present: lb) + ; +} #endif #endif diff --git a/clang/test/OpenMP/target_update_messages.cpp b/clang/test/OpenMP/target_update_messages.cpp index 4092b623bdc97..401941b63028c 100644 --- a/clang/test/OpenMP/target_update_messages.cpp +++ b/clang/test/OpenMP/target_update_messages.cpp @@ -1,8 +1,10 @@ -// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,lt50,lt51 -fopenmp -fopenmp-version=45 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge50,lt51 -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge50,ge51 -fopenmp -fopenmp-version=51 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized -// RUN: %clang_cc1 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,lt50,lt51 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge50,lt51 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized +// RUN: %clang_cc1 -verify=expected,ge50,ge51 -fopenmp-simd -fopenmp-version=51 -ferror-limit 100 -o - -std=c++11 %s -Wuninitialized void xxx(int argc) { int x; // expected-note 
{{initialize the variable 'x' to silence this warning}} @@ -25,6 +27,10 @@ int tmain(T argc, S **argv) { return 0; } +struct S { + int i; +}; + int main(int argc, char **argv) { int m; #pragma omp target update // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} @@ -34,15 +40,114 @@ int main(int argc, char **argv) { #pragma omp target update to(m) ] // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} #pragma omp target update to(m) ) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} + #pragma omp declare mapper(id: S s) map(s.i) + S s; + + // Check parsing with no modifiers. + // lt51-error@+2 {{expected expression}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(: s) + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(:) + // expected-error@+2 2 {{expected expression}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(,:) + + // Check parsing with one modifier. + // expected-error@+2 {{use of undeclared identifier 'foobar'}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(foobar: s) + // expected-error@+3 {{expected ',' or ')' in 'to' clause}} + // expected-error@+2 {{expected ')'}} + // expected-note@+1 {{to match this '('}} + #pragma omp target update to(m: s) + #pragma omp target update to(mapper(id): s) + // lt51-error@+2 {{use of undeclared identifier 'present'}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present: s) + // ge51-warning@+4 {{missing ':' after motion modifier - ignoring}} + // lt51-warning@+3 {{missing ':' after ) - ignoring}} + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(mapper(id) s) + // ge51-warning@+4 {{missing ':' after motion modifier - ignoring}} + // ge51-error@+3 {{expected expression}} + // lt51-error@+2 {{use of undeclared identifier 'present'}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present s) + // ge51-warning@+4 {{missing ':' after motion modifier - ignoring}} + // lt51-warning@+3 {{missing ':' after ) - ignoring}} + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(mapper(id)) + // ge51-warning@+4 {{missing ':' after motion modifier - ignoring}} + // ge51-error@+3 {{expected expression}} + // lt51-error@+2 {{use of undeclared identifier 'present'}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present) + // expected-error@+2 {{expected expression}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma 
omp target update to(mapper(id):) + // ge51-error@+3 {{expected expression}} + // lt51-error@+2 {{use of undeclared identifier 'present'}} + // expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present:) + + // Check parsing with two modifiers. + // lt51-warning@+1 {{missing ':' after ) - ignoring}} + #pragma omp target update to(mapper(id), present: s) + // lt51-error@+3 {{use of undeclared identifier 'present'}} + // lt51-error@+2 {{use of undeclared identifier 'id'}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present, mapper(id): s) + // lt51-warning@+1 {{missing ':' after ) - ignoring}} + #pragma omp target update to(mapper(id) present: s) + // lt51-error@+2 {{use of undeclared identifier 'present'}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present mapper(id): s) + + // Check parsing with unnecessary commas. + // lt51-warning@+1 {{missing ':' after ) - ignoring}} + #pragma omp target update to(mapper(id),: s) + // lt51-error@+3 {{use of undeclared identifier 'present'}} + // lt51-error@+2 {{expected expression}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present , : s) + // ge51-warning@+2 {{missing ':' after motion modifier - ignoring}} + // lt51-warning@+1 {{missing ':' after ) - ignoring}} + #pragma omp target update to(mapper(id),,: s) + // ge51-warning@+5 {{missing ':' after motion modifier - ignoring}} + // lt51-error@+4 {{use of undeclared identifier 'present'}} + // lt51-error@+3 {{expected expression}} + // lt51-error@+2 {{expected expression}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present,,: s) + // lt51-warning@+1 {{missing ':' after ) - ignoring}} + #pragma omp target update to(mapper(id), present,: s) + // lt51-error@+4 {{use of undeclared identifier 'present'}} + // lt51-error@+3 {{use of undeclared identifier 'id'}} + // lt51-error@+2 {{expected expression}} + // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} + #pragma omp target update to(present, mapper(id),: s) + #pragma omp target update from(m) allocate(m) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp target update'}} { foo(); } int iarr[5][5]; -#pragma omp target update to(iarr[0:][1:2:-1]) // omp50-error {{section stride is evaluated to a non-positive value -1}} omp45-error {{expected ']'}} omp45-note {{to match this '['}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +// ge50-error@+4 {{section stride is evaluated to a non-positive value -1}} +// lt50-error@+3 {{expected ']'}} +// lt50-note@+2 {{to match this '['}} +// expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update to(iarr[0:][1:2:-1]) {} -#pragma omp target update from(iarr[0:][1:2:-1]) // omp50-error {{section stride is evaluated to a non-positive value -1}} omp45-error {{expected ']'}} omp45-note {{to match this '['}} expected-error {{expected at least one 'to' clause or 'from' clause specified to 
'#pragma omp target update'}} +// ge50-error@+4 {{section stride is evaluated to a non-positive value -1}} +// lt50-error@+3 {{expected ']'}} +// lt50-note@+2 {{to match this '['}} +// expected-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update from(iarr[0:][1:2:-1]) return tmain(argc, argv); } diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp index 2a561ce7744e6..922623499d494 100644 --- a/clang/test/OpenMP/task_codegen.cpp +++ b/clang/test/OpenMP/task_codegen.cpp @@ -33,12 +33,11 @@ int main() { char b; S s[2]; int arr[10][a]; -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T]]* @{{.+}}) // CHECK: [[B_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES:%.+]], i32 0, i32 0 // CHECK: store i8* [[B]], i8** [[B_REF]] // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES]], i32 0, i32 1 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]] -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 33, i64 40, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*)) +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 33, i64 40, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*)) // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]] // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8* @@ -46,7 +45,7 @@ int main() { // CHECK: [[PRIORITY_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 4 // CHECK: [[PRIORITY:%.+]] = bitcast %union{{.+}}* [[PRIORITY_REF_PTR]] to i32* // CHECK: store i32 {{.+}}, i32* [[PRIORITY]] -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) +// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) #pragma omp task shared(a, b, s) priority(b) { a = 15; @@ -55,7 +54,7 @@ int main() { } // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS1]], [[STRUCT_SHAREDS1]]* [[CAPTURES:%.+]], i32 0, i32 0 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]] -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 8, +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 {{%.*}}, i32 1, i64 40, i64 8, // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]] // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS1]]* [[CAPTURES]] to i8* @@ -101,20 +100,20 @@ int main() { // CHECK: [[T0:%.*]] = getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2 // CHECK: store i8 1, i8* [[T0]] // CHECK: bitcast [[KMP_DEPEND_INFO]]* [[DEP_BASE]] to i8* -// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]], i32 4, i8* %{{[^,]+}}, i32 0, i8* null) +// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* 
[[ORIG_TASK_PTR]], i32 4, i8* %{{[^,]+}}, i32 0, i8* null) #pragma omp task shared(a, s) depend(in : a, b, s, arr[:]) { a = 15; s[1].a = 10; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*)) -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*)) +// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) #pragma omp task untied { #pragma omp critical a = 1; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 40, i64 1, // CHECK: getelementptr inbounds [2 x [[STRUCT_S]]], [2 x [[STRUCT_S]]]* [[S]], i64 0, i64 0 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0 // CHECK: ptrtoint [[STRUCT_S]]* %{{.+}} to i64 @@ -146,12 +145,12 @@ int main() { // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2 // CHECK: store i8 3, i8* // CHECK: bitcast [[KMP_DEPEND_INFO]]* %{{.+}} to i8* -// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]], i32 2, i8* %{{[^,]+}}, i32 0, i8* null) +// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]], i32 2, i8* %{{[^,]+}}, i32 0, i8* null) #pragma omp task untied depend(out : s[0], arr[4:][b]) { a = 1; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 40, i64 1, // CHECK: getelementptr inbounds [2 x [[STRUCT_S]]], [2 x [[STRUCT_S]]]* [[S]], i64 0, i64 0 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0 // CHECK: ptrtoint [[STRUCT_S]]* %{{.+}} to i64 @@ -183,12 +182,12 @@ int main() { // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2 // CHECK: store i8 4, i8* // CHECK: bitcast [[KMP_DEPEND_INFO]]* %{{.+}} to i8* -// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]], i32 2, i8* %{{[^,]+}}, i32 0, i8* null) +// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]], i32 2, i8* %{{[^,]+}}, i32 0, i8* null) #pragma omp task untied depend(mutexinoutset: s[0], arr[4:][b]) { a = 1; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1, +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 3, i64 40, i64 1, // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64* // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 1 @@ -229,38 +228,38 @@ int main() { // CHECK: getelementptr inbounds 
[[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2 // CHECK: store i8 3, i8* // CHECK: bitcast [[KMP_DEPEND_INFO]]* %{{.+}} to i8* -// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]], i32 3, i8* %{{[^,]+}}, i32 0, i8* null) +// CHECK: call i32 @__kmpc_omp_task_with_deps([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]], i32 3, i8* %{{[^,]+}}, i32 0, i8* null) #pragma omp task final(true) depend(inout: a, s[1], arr[:a][3:]) { a = 2; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*)) -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 3, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*)) +// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) #pragma omp task final(true) { a = 2; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*)) -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) + // CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*)) + // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) const bool flag = false; #pragma omp task final(flag) { a = 3; } -// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]] -// CHECK: [[CMP:%.+]] = icmp ne i8 [[B_VAL]], 0 -// CHECK: [[FINAL:%.+]] = select i1 [[CMP]], i32 2, i32 0 -// CHECK: [[FLAGS:%.+]] = or i32 [[FINAL]], 1 -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*)) -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) + // CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]] + // CHECK: [[CMP:%.+]] = icmp ne i8 [[B_VAL]], 0 + // CHECK: [[FINAL:%.+]] = select i1 [[CMP]], i32 2, i32 0 + // CHECK: [[FLAGS:%.+]] = or i32 [[FINAL]], 1 + // CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 [[FLAGS]], i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*)) + // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) int c __attribute__((aligned(128))); #pragma omp task final(b) shared(c) { a = 4; c = 5; } -// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*)) -// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]]) +// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, 
[[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*)) +// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 {{%.*}}, i8* [[ORIG_TASK_PTR]]) #pragma omp task untied { S s1; diff --git a/clang/test/OpenMP/taskloop_codegen.cpp b/clang/test/OpenMP/taskloop_codegen.cpp index 55e43ff3a1152..7402c2ad65eba 100644 --- a/clang/test/OpenMP/taskloop_codegen.cpp +++ b/clang/test/OpenMP/taskloop_codegen.cpp @@ -229,4 +229,20 @@ struct S { // CHECK: br label % // CHECK: ret i32 0 +class St { +public: + operator int(); + St &operator+=(int); +}; + +// CHECK-LABEL: taskloop_with_class +void taskloop_with_class() { + St s1; + // CHECK: [[TD:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 [[GTID:%.+]], i32 1, i64 88, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[TD_TYPE:%.+]]*)* @{{.+}} to i32 (i32, i8*)*)) + // CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 [[GTID]], i8* [[TD]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[TD_TYPE]]*, [[TD_TYPE]]*, i32)* @{{.+}} to i8*)) +#pragma omp taskloop + for (St s = St(); s < s1; s += 1) { + } +} + #endif diff --git a/clang/test/PCH/cxx-variadic-templates.cpp b/clang/test/PCH/cxx-variadic-templates.cpp index 87b101d73c142..b1eed5adb647b 100644 --- a/clang/test/PCH/cxx-variadic-templates.cpp +++ b/clang/test/PCH/cxx-variadic-templates.cpp @@ -19,3 +19,8 @@ shared_ptr spi = shared_ptr::allocate_shared(1, 2); template struct A {}; template struct B {}; outer::inner<1, 2, A, B> i(A<1>{}, B<2>{}); + +void test_nondependent_pack() { + take_nondependent_pack(nullptr, nullptr); + take_nondependent_pack_2({}); +} diff --git a/clang/test/PCH/cxx-variadic-templates.h b/clang/test/PCH/cxx-variadic-templates.h index 50596cdf5dbf9..45395e9ae84aa 100644 --- a/clang/test/PCH/cxx-variadic-templates.h +++ b/clang/test/PCH/cxx-variadic-templates.h @@ -23,3 +23,8 @@ template struct outer { }; }; template struct outer; + +template void take_nondependent_pack(int (...arr)[sizeof(sizeof(T))]); + +template using hide = int; +template void take_nondependent_pack_2(outer...>); diff --git a/clang/test/PCH/cxx1y-lambdas.mm b/clang/test/PCH/cxx1y-lambdas.mm index f140a15215b8f..9c4c11970473b 100644 --- a/clang/test/PCH/cxx1y-lambdas.mm +++ b/clang/test/PCH/cxx1y-lambdas.mm @@ -39,6 +39,8 @@ int init_capture(T t) { return [&, x(t)] { return sizeof(x); }; } +auto with_pack = [](auto ...xs){}; + #else // CHECK-PRINT: T add_slowly @@ -55,4 +57,6 @@ int add(int x, int y) { // CHECK-PRINT: init_capture // CHECK-PRINT: [&, x(t)] +void use_with_pack() { with_pack(1, 2, 3); } + #endif diff --git a/clang/test/PCH/cxx2a-constraints-crash.cpp b/clang/test/PCH/cxx2a-constraints-crash.cpp new file mode 100644 index 0000000000000..637c55f0c879c --- /dev/null +++ b/clang/test/PCH/cxx2a-constraints-crash.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -std=c++2a -emit-pch %s -o %t +// RUN: %clang_cc1 -std=c++2a -include-pch %t -verify %s + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +template +concept not_same_as = true; + +template +struct subrange { + template R> + subrange(R) requires(Kind == 0); + + template R> + subrange(R) requires(Kind != 0); +}; + +template +subrange(R) -> subrange<42>; + +int main() { + int c; + subrange s(c); +} + +#endif diff --git a/clang/test/PCH/cxx2a-constraints.cpp b/clang/test/PCH/cxx2a-constraints.cpp index d8b79337c8f18..3f3b5e536cc93 100644 --- a/clang/test/PCH/cxx2a-constraints.cpp +++ b/clang/test/PCH/cxx2a-constraints.cpp @@ -24,6 +24,8 @@ template T> void 
h(T) {} template T> void i(T) {} template void i(T) {} +void j(SizedLike auto ...ints) {} + #else /*included pch*/ int main() { @@ -35,6 +37,7 @@ int main() { (void)h(1); (void)i('1'); (void)i(1); + (void)j(1, 2, 3); } -#endif // HEADER \ No newline at end of file +#endif // HEADER diff --git a/clang/test/Parser/cxx-template-decl.cpp b/clang/test/Parser/cxx-template-decl.cpp index 64e7ca921f575..7455b1d0ffe72 100644 --- a/clang/test/Parser/cxx-template-decl.cpp +++ b/clang/test/Parser/cxx-template-decl.cpp @@ -12,7 +12,7 @@ export template x; // expected-error {{expected '<' after 'template'}} export template class x0; // expected-warning {{exported templates are unsupported}} template < ; // expected-error {{expected template parameter}} \ // expected-error{{expected ',' or '>' in template-parameter-list}} \ -// expected-warning {{declaration does not declare anything}} +// expected-error {{declaration does not declare anything}} template struct x1; // expected-error {{expected ',' or '>' in template-parameter-list}} // verifies that we only walk to the ',' & still produce errors on the rest of the template parameters @@ -286,3 +286,12 @@ namespace PR45239 { template int b; template auto f() -> b<0>; // expected-error +{{}} } + +namespace PR46231 { + template; // expected-error {{declaration does not declare anything}} + template<>; // expected-error {{declaration does not declare anything}} + template; // expected-error {{declaration does not declare anything}} + template int; // expected-error {{declaration does not declare anything}} + template<> int; // expected-error {{declaration does not declare anything}} + template int; // expected-error {{declaration does not declare anything}} +} diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index ed8601636554e..48d35c95aa570 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -408,21 +408,6 @@ // PPC64LE:#define __ppc64__ 1 // PPC64LE:#define __ppc__ 1 // -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s -// -// PPCA2Q:#define _ARCH_A2 1 -// PPCA2Q:#define _ARCH_A2Q 1 -// PPCA2Q:#define _ARCH_PPC 1 -// PPCA2Q:#define _ARCH_PPC64 1 -// PPCA2Q:#define _ARCH_QP 1 -// -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s -// -// PPCBGQ:#define __THW_BLUEGENE__ 1 -// PPCBGQ:#define __TOS_BGQ__ 1 -// PPCBGQ:#define __bg__ 1 -// PPCBGQ:#define __bgq__ 1 -// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC630 %s // // PPC630:#define _ARCH_630 1 @@ -1069,7 +1054,6 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s -// RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck 
-match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -fgnuc-version=4.2.1 -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s diff --git a/clang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext b/clang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext index 64d18116ed647..0f39af77715dd 100644 --- a/clang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext +++ b/clang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext @@ -2,7 +2,7 @@ :ir main # Func Hash: -34137660316 +1063705162469825436 # Num Counters: 2 # Counter Values: diff --git a/clang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext b/clang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext index 4903f56c419e0..d60162c0f5294 100644 --- a/clang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext +++ b/clang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext @@ -3,7 +3,7 @@ :entry_first main # Func Hash: -34137660316 +1063705162469825436 # Num Counters: 2 # Counter Values: diff --git a/clang/test/Sema/attr-flag-enum.c b/clang/test/Sema/attr-flag-enum.c index ae3e3ad5ab99f..467afd950973d 100644 --- a/clang/test/Sema/attr-flag-enum.c +++ b/clang/test/Sema/attr-flag-enum.c @@ -6,6 +6,10 @@ enum __attribute__((flag_enum)) flag { ec = 0x8, }; +enum __attribute__((flag_enum)) { + g = 0x7, // expected-warning {{enumeration value 'g' is out of range of flags in enumeration type ''}} +}; + enum __attribute__((flag_enum)) flag2 { ga = 0x1, gb = 0x4, diff --git a/clang/test/SemaCXX/PR16677.cpp b/clang/test/SemaCXX/PR16677.cpp index efa4faaacd693..3ad76c6d861d7 100644 --- a/clang/test/SemaCXX/PR16677.cpp +++ b/clang/test/SemaCXX/PR16677.cpp @@ -11,5 +11,5 @@ template { // expected-error{{'Derived' cannot be defined in a type specifier}} Class_With_Destructor member; }; // expected-error{{expected ',' or '>' in template-parameter-list}} - // expected-warning@-1{{declaration does not declare anything}} + // expected-error@-1{{declaration does not declare anything}} diff --git a/clang/test/SemaCXX/array-bounds.cpp b/clang/test/SemaCXX/array-bounds.cpp index 495ccaf71bd6a..47be6c2423dc1 100644 --- a/clang/test/SemaCXX/array-bounds.cpp +++ b/clang/test/SemaCXX/array-bounds.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -verify -std=c++11 %s +// RUN: %clang_cc1 -verify -std=c++14 %s int foo() { int x[2]; // expected-note 4 {{array 'x' declared here}} @@ -309,3 +309,14 @@ namespace PR41087 { foo(); // expected-note 1{{in instantiation of function template specialization}} }; } + +namespace var_template_array { +template int arr[2]; // expected-note {{array 'arr' declared here}} +template <> int arr[1]; // expected-note {{array 'arr' declared here}} + +void test() { + arr[1] = 0; // ok + arr[2] = 0; // expected-warning {{array index 2 is past the end of the array (which contains 2 elements)}} + arr[1] = 0; // expected-warning {{array index 1 is past the end of the array (which contains 1 element)}} +} +} // namespace var_template_array diff --git a/clang/test/SemaCXX/attr-unused.cpp b/clang/test/SemaCXX/attr-unused.cpp index e3878152eca97..5bca693864e32 100644 --- a/clang/test/SemaCXX/attr-unused.cpp +++ b/clang/test/SemaCXX/attr-unused.cpp @@ -15,5 +15,5 @@ void f() { }; (void) i; - C(); // expected-warning {{'C' was marked 
unused but was used}} + C(); // expected-warning {{'C' was marked unused but was used}} } diff --git a/clang/test/SemaCXX/auto-type-from-cxx.cpp b/clang/test/SemaCXX/auto-type-from-cxx.cpp index 961402f7d9331..21620f73be7e9 100644 --- a/clang/test/SemaCXX/auto-type-from-cxx.cpp +++ b/clang/test/SemaCXX/auto-type-from-cxx.cpp @@ -4,7 +4,8 @@ struct A { operator __auto_type() {} // expected-error {{'__auto_type' not allowed in conversion function type}} }; -__auto_type a() -> int; // expected-error {{'__auto_type' not allowed in function return type}} +__auto_type a() -> int; // expected-error {{function with trailing return type must specify return type 'auto'}} +__auto_type a2(); // expected-error {{'__auto_type' not allowed in function return type}} template __auto_type b() { return T::x; } // expected-error {{'__auto_type' not allowed in function return type}} auto c() -> __auto_type { __builtin_unreachable(); } // expected-error {{'__auto_type' not allowed in function return type}} diff --git a/clang/test/SemaCXX/cxx1y-deduced-return-type.cpp b/clang/test/SemaCXX/cxx1y-deduced-return-type.cpp index 687ecf28ee3d7..958728b104876 100644 --- a/clang/test/SemaCXX/cxx1y-deduced-return-type.cpp +++ b/clang/test/SemaCXX/cxx1y-deduced-return-type.cpp @@ -620,3 +620,11 @@ namespace PR33222 { // FIXME: suppress this follow-on error: expected-error@-1 {{cannot initialize}} template<> int B::q() { return 0; } // expected-error {{return type}} } + +namespace PR46637 { + using A = auto () -> auto; // expected-error {{'auto' not allowed in type alias}} + using B = auto (*)() -> auto; // expected-error {{'auto' not allowed in type alias}} + template auto> struct X {}; // expected-error {{'auto' not allowed in template parameter until C++17}} + template struct Y { T x; }; + Y auto> y; // expected-error {{'auto' not allowed in template argument}} +} diff --git a/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp b/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp index 13ab7aae6c32b..52caaa59dd30f 100644 --- a/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp +++ b/clang/test/SemaCXX/cxx1y-generic-lambdas.cpp @@ -1012,3 +1012,11 @@ namespace PR32638 { [](auto x) noexcept(noexcept(x)) { } (0); } } + +namespace PR46637 { + auto x = [](auto (*p)()) { return p(); }; + auto y = [](auto (*p)() -> auto) { return p(); }; + int f(); + void *v = x(f); // expected-error {{cannot initialize a variable of type 'void *' with an rvalue of type 'int'}} + void *w = y(f); // expected-error {{cannot initialize a variable of type 'void *' with an rvalue of type 'int'}} +} diff --git a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp index fc49ec88d5537..1a24c66805690 100644 --- a/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp +++ b/clang/test/SemaCXX/cxx1y-variable-templates_in_class.cpp @@ -164,7 +164,7 @@ namespace constexpred { // expected-error {{non-static data member cannot be constexpr; did you intend to make it const?}} template constexpr float right = 5; // expected-error {{non-static data member cannot be constexpr; did you intend to make it static?}} template<> static constexpr int right = 7; - template<> static constexpr float right; // expected-error {{requires an initializer}} + template <> static constexpr float right; // expected-error {{declaration of constexpr static data member 'right' requires an initializer}} template static constexpr int right; // expected-error {{expected '<' after 'template'}} }; } diff --git 
a/clang/test/SemaCXX/default2.cpp b/clang/test/SemaCXX/default2.cpp index 7651233f8636f..c51d272853e72 100644 --- a/clang/test/SemaCXX/default2.cpp +++ b/clang/test/SemaCXX/default2.cpp @@ -119,7 +119,7 @@ class C2 { template class C3; template <> class C3 { - static void g(int = f()); // expected-error {{use of default argument to function 'f' that is declared later in class 'C3'}} + static void g(int = f()); // expected-error {{use of default argument to function 'f' that is declared later in class 'C3'}} static int f(int = 10); // expected-note {{default argument declared here}} }; diff --git a/clang/test/SemaCXX/extern-c.cpp b/clang/test/SemaCXX/extern-c.cpp index 1a32beec7baab..68d1494b94918 100644 --- a/clang/test/SemaCXX/extern-c.cpp +++ b/clang/test/SemaCXX/extern-c.cpp @@ -259,3 +259,8 @@ namespace PR35697 { } } } + +namespace PR46859 { + extern "bogus" // expected-error {{unknown linkage language}} + template struct X {}; // expected-error {{templates can only be declared in namespace or class scope}} +} diff --git a/clang/test/SemaCXX/incomplete-call.cpp b/clang/test/SemaCXX/incomplete-call.cpp index 46f470e4a8810..208daa2988a66 100644 --- a/clang/test/SemaCXX/incomplete-call.cpp +++ b/clang/test/SemaCXX/incomplete-call.cpp @@ -40,7 +40,7 @@ void g() { A (B::*mfp)() = 0; (b.*mfp)(); // expected-error {{calling function with incomplete return type 'A'}} - ft(42); // expected-error {{calling 'ft' with incomplete return type 'A'}} + ft(42); // expected-error {{calling 'ft' with incomplete return type 'A'}} } diff --git a/clang/test/SemaCXX/invalid-template-params.cpp b/clang/test/SemaCXX/invalid-template-params.cpp index 0c463fe13d5b0..21220f3fea345 100644 --- a/clang/test/SemaCXX/invalid-template-params.cpp +++ b/clang/test/SemaCXX/invalid-template-params.cpp @@ -5,7 +5,7 @@ template class Foo { // expected-note@-1 {{'UBar' declared here}} void foo1(); // expected-error {{a non-type template parameter cannot have type 'class UBar'}} // expected-error@-1 {{expected ',' or '>' in template-parameter-list}} - // expected-warning@-2 {{declaration does not declare anything}} + // expected-error@-2 {{declaration does not declare anything}} }; Foo::UBar g1; // expected-error {{no type named 'UBar' in 'Foo'}} @@ -16,7 +16,7 @@ class C0 { struct S0 {}; // expected-error {{'S0' cannot be defined in a type specifier}} // expected-error@-1 {{cannot combine with previous 'type-name' declaration specifier}} // expected-error@-2 {{expected ',' or '>' in template-parameter-list}} - // expected-warning@-3 {{declaration does not declare anything}} + // expected-error@-3 {{declaration does not declare anything}} C0() : m(new S0) {} // expected-error {{expected '(' for function-style cast or type construction}} // expected-error@-1 {{expected expression}} S0 *m; // expected-error {{expected member name or ';' after declaration specifiers}} diff --git a/clang/test/SemaCXX/references.cpp b/clang/test/SemaCXX/references.cpp index eaab1ae833e4e..f059eb6e64602 100644 --- a/clang/test/SemaCXX/references.cpp +++ b/clang/test/SemaCXX/references.cpp @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s int g(int); void f() { @@ -114,6 +115,14 @@ void test8(int& const,// expected-error{{'const' qualifier may not be applied to void restrict_ref(int &__restrict); // ok } +namespace var_template { +#if __cplusplus >= 201402L +int i; +template 
int &ref = i; // ok +template <> int &ref; // expected-error {{declaration of reference variable 'ref' requires an initializer}} +#endif +} // namespace var_template + template int const_param(const T) {} int const_ref_param = const_param(const_ref_param); // no-warning diff --git a/clang/test/SemaCXX/return-void.cpp b/clang/test/SemaCXX/return-void.cpp index b3aa203133dc3..c72fbdfae9fa9 100644 --- a/clang/test/SemaCXX/return-void.cpp +++ b/clang/test/SemaCXX/return-void.cpp @@ -4,7 +4,7 @@ void f1() { return {1,2}; } // expected-error {{void function 'f1' must not retu template void f2() { return {1,2}; } // expected-error {{void function 'f2' must not return a value}} -template <> void f2() { return {1,2}; } // expected-error {{void function 'f2' must not return a value}} +template <> void f2() { return {1, 2}; } // expected-error {{void function 'f2' must not return a value}} void test_f2() { f2(); diff --git a/clang/test/SemaCXX/trailing-return-0x.cpp b/clang/test/SemaCXX/trailing-return-0x.cpp index c6b22c54be0c1..4834aaf5277a8 100644 --- a/clang/test/SemaCXX/trailing-return-0x.cpp +++ b/clang/test/SemaCXX/trailing-return-0x.cpp @@ -104,3 +104,10 @@ namespace PR16273 { }; } +namespace PR46637 { + using A = auto () -> auto; // expected-error {{'auto' not allowed in function return type}} + using B = auto (*)() -> auto; // expected-error {{'auto' not allowed in function return type}} + template auto> struct X {}; // expected-error {{'auto' not allowed in function return type}} + template struct Y { T x; }; + Y auto> y; // expected-error {{'auto' not allowed in function return type}} +} diff --git a/clang/test/SemaCXX/void-argument.cpp b/clang/test/SemaCXX/void-argument.cpp new file mode 100644 index 0000000000000..8354347f5559e --- /dev/null +++ b/clang/test/SemaCXX/void-argument.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +void fun( + void a, // expected-error{{'void' must be the first and only parameter if specified}} + double b, + int c, + void d, // expected-error{{'void' must be the first and only parameter if specified}} + int e, + void f) // expected-error{{'void' must be the first and only parameter if specified}} +{} + +void foo( + int a, + void, // expected-error{{'void' must be the first and only parameter if specified}} + int b); + +void bar( + void, // expected-error{{'void' must be the first and only parameter if specified}} + ...); + +struct S { + S( + void, // expected-error{{'void' must be the first and only parameter if specified}} + void); // expected-error{{'void' must be the first and only parameter if specified}} +}; diff --git a/clang/test/SemaCXX/warn-func-not-needed.cpp b/clang/test/SemaCXX/warn-func-not-needed.cpp index 65721f44f5707..cb3cae4cd6c76 100644 --- a/clang/test/SemaCXX/warn-func-not-needed.cpp +++ b/clang/test/SemaCXX/warn-func-not-needed.cpp @@ -1,13 +1,23 @@ // RUN: %clang_cc1 -fsyntax-only -verify -Wall %s namespace test1 { - static void f() {} // expected-warning {{is not needed and will not be emitted}} - static void f(); - template - void foo() { - f(); - } +static void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} +static void f(); +template +void foo() { + f(); +} +} + +namespace test1_template { +template static void f() {} +template <> void f() {} // expected-warning {{function 'f' is not needed and will not be emitted}} +template +void foo() { + f(); + f(); } +} // namespace test1_template namespace test2 { static void f() {} diff --git 
a/clang/test/SemaCXX/warn-large-by-value-copy.cpp b/clang/test/SemaCXX/warn-large-by-value-copy.cpp index 3e419ec08f075..309fdc75dbd79 100644 --- a/clang/test/SemaCXX/warn-large-by-value-copy.cpp +++ b/clang/test/SemaCXX/warn-large-by-value-copy.cpp @@ -16,6 +16,14 @@ S100 f100(S100 s) { return s; } S101 f101(S101 s) { return s; } // expected-warning {{return value of 'f101' is a large (101 bytes) pass-by-value object}} \ // expected-warning {{'s' is a large (101 bytes) pass-by-value argument}} +void f101_no_param_name(S101) {} // expected-warning {{'' is a large (101 bytes) pass-by-value argument}} + +// FIXME: Don't warn when the return value is subject to (N)RVO. + +template T foo_template(T); +template <> S101 foo_template(S101) { return S101(); } // expected-warning {{return value of 'foo_template' is a large}} + // expected-warning@-1 {{'' is a large (101 bytes) pass-by-value argument}} + typedef int Arr[200]; void farr(Arr a) { } diff --git a/clang/test/SemaCXX/warn-member-not-needed.cpp b/clang/test/SemaCXX/warn-member-not-needed.cpp index 61bb3488c6116..c48447719ba99 100644 --- a/clang/test/SemaCXX/warn-member-not-needed.cpp +++ b/clang/test/SemaCXX/warn-member-not-needed.cpp @@ -1,11 +1,19 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wunneeded-member-function %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wunneeded-member-function -Wno-unused-template %s namespace { class A { - void g() {} // expected-warning {{is not needed and will not be emitted}} + void g() {} // expected-warning {{member function 'g' is not needed and will not be emitted}} + template void gt(T) {} + template <> void gt(int) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template <> void gt(float) {} // expected-warning {{member function 'gt' is not needed and will not be emitted}} + template void foo() { g(); + gt(0); + gt(0.0f); + gt(0.0); } }; + template void A::gt(double); // no-warning } diff --git a/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp b/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp index 789935e3470ac..9acf84c6ce8c5 100644 --- a/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp +++ b/clang/test/SemaCXX/warn-pure-virtual-call-from-ctor-dtor.cpp @@ -22,8 +22,8 @@ struct C { }; template struct TA { - TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} - ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} + TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} + ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} virtual void f() = 0; // expected-note 2{{'f' declared here}} }; @@ -35,8 +35,8 @@ template <> struct TA { }; template <> struct TA { - TA() { f(); } 
// expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} + ~TA() { f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the destructor of 'TA'}} virtual void f() = 0; // expected-note 2{{'f' declared here}} }; diff --git a/clang/test/SemaCXX/warn-pure-virtual-kext.cpp b/clang/test/SemaCXX/warn-pure-virtual-kext.cpp index 8431e202ad714..d23456fa4fd5c 100644 --- a/clang/test/SemaCXX/warn-pure-virtual-kext.cpp +++ b/clang/test/SemaCXX/warn-pure-virtual-kext.cpp @@ -10,7 +10,7 @@ struct A { template struct TA { virtual void f() = 0; // expected-note {{'f' declared here}} - TA() { TA::f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} // expected-note {{qualified call to 'TA'::'f' is treated as a virtual call to 'f' due to -fapple-kext}} + TA() { TA::f(); } // expected-warning {{call to pure virtual member function 'f' has undefined behavior; overrides of 'f' in subclasses are not available in the constructor of 'TA'}} // expected-note {{qualified call to 'TA'::'f' is treated as a virtual call to 'f' due to -fapple-kext}} }; struct B : TA { // expected-note {{in instantiation of member function 'TA::TA' requested here}} diff --git a/clang/test/SemaCXX/warn-unused-filescoped.cpp b/clang/test/SemaCXX/warn-unused-filescoped.cpp index 7ea398feb2b1e..d53608003b16d 100644 --- a/clang/test/SemaCXX/warn-unused-filescoped.cpp +++ b/clang/test/SemaCXX/warn-unused-filescoped.cpp @@ -1,14 +1,15 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs -Wno-c++11-extensions -std=c++98 %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs \ +// RUN: -Wno-c++11-extensions -Wno-c++14-extensions -std=c++98 %s // RUN: %clang_cc1 -fsyntax-only -verify -Wunused -Wunused-template -Wunused-member-function -Wno-unused-local-typedefs -std=c++14 %s #ifdef HEADER -static void headerstatic() {} // expected-warning{{unused}} +static void headerstatic() {} // expected-warning{{unused function 'headerstatic'}} static inline void headerstaticinline() {} namespace { - void headeranon() {} // expected-warning{{unused}} - inline void headerinlineanon() {} +void headeranon() {} // expected-warning{{unused function 'headeranon'}} +inline void headerinlineanon() {} } namespace test7 @@ -43,31 +44,31 @@ namespace pr19713 { #define HEADER #include "warn-unused-filescoped.cpp" -static void f1(); // expected-warning{{unused}} +static void f1(); // expected-warning{{unused function 'f1'}} namespace { - void f2(); // expected-warning{{unused}} +void f2(); // expected-warning{{unused function 'f2'}} - void f3() { } // expected-warning{{unused}} +void f3() {} // expected-warning{{unused function 'f3'}} - struct S { - void m1() { } // expected-warning{{unused}} - void m2(); // expected-warning{{unused}} - void m3(); - S(const S&); - void operator=(const S&); - }; +struct S { + void m1() {} // expected-warning{{unused member function 'm1'}} + void m2(); // expected-warning{{unused member function 'm2'}} + void m3(); + S(const S &); + void operator=(const S &); +}; template struct TS { void m(); }; - template <> void TS::m() { } // expected-warning{{unused}} + template <> void TS::m() {} // 
expected-warning{{unused member function 'm'}} template - void tf() { } // expected-warning{{unused}} - template <> void tf() { } // expected-warning{{unused}} - + void tf() {} // expected-warning{{unused function template 'tf'}} + template <> void tf() {} // expected-warning{{unused function 'tf'}} + struct VS { virtual void vm() { } }; @@ -77,27 +78,31 @@ namespace { }; } -void S::m3() { } // expected-warning{{unused}} +void S::m3() {} // expected-warning{{unused member function 'm3'}} -static inline void f4() { } // expected-warning{{unused}} -const unsigned int cx = 0; // expected-warning{{unused}} +static inline void f4() {} // expected-warning{{unused function 'f4'}} +const unsigned int cx = 0; // expected-warning{{unused variable 'cx'}} const unsigned int cy = 0; int f5() { return cy; } -static int x1; // expected-warning{{unused}} +static int x1; // expected-warning{{unused variable 'x1'}} namespace { - int x2; // expected-warning{{unused}} - - struct S2 { - static int x; // expected-warning{{unused}} - }; +int x2; // expected-warning{{unused variable 'x2'}} + +struct S2 { + static int x; // expected-warning{{unused variable 'x'}} +}; template struct TS2 { static int x; }; - template <> int TS2::x; // expected-warning{{unused}} + template <> int TS2::x; // expected-warning{{unused variable 'x'}} + + template int vt = 0; // expected-warning {{unused variable template 'vt'}} + template int vt = 0; + template <> int vt = 0; // expected-warning {{unused variable 'vt'}} } namespace PR8841 { @@ -120,17 +125,21 @@ namespace PR8841 { namespace test4 { namespace { struct A {}; } - void test(A a); // expected-warning {{unused function}} + void test(A a); // expected-warning {{unused function 'test'}} extern "C" void test4(A a); } namespace rdar8733476 { - static void foo() { } // expected-warning {{not needed and will not be emitted}} +static void foo() {} // expected-warning {{function 'foo' is not needed and will not be emitted}} +template static void foo_t() {} // expected-warning {{unused function template 'foo_t'}} +template <> void foo_t() {} // expected-warning {{function 'foo_t' is not needed and will not be emitted}} - template - void bar() { - foo(); - } +template +void bar() { + foo(); + foo_t(); + foo_t(); +} } namespace test5 { @@ -142,8 +151,15 @@ namespace test5 { // FIXME: We should produce warnings for both of these. static const int m = n; int x = sizeof(m); - static const double d = 0.0; // expected-warning{{not needed and will not be emitted}} + static const double d = 0.0; // expected-warning{{variable 'd' is not needed and will not be emitted}} int y = sizeof(d); + + namespace { + // FIXME: Should be "unused variable template 'var_t'" instead. 
+ template const double var_t = 0; // expected-warning {{unused variable 'var_t'}} + template <> const double var_t = 0; // expected-warning {{variable 'var_t' is not needed and will not be emitted}} + int z = sizeof(var_t); // expected-warning {{unused variable 'z'}} + } // namespace } namespace unused_nested { @@ -158,9 +174,9 @@ namespace unused_nested { namespace unused { struct { - void func() { // expected-warning {{unused member function}} + void func() { // expected-warning {{unused member function 'func'}} } - } x; // expected-warning {{unused variable}} + } x; // expected-warning {{unused variable 'x'}} } namespace test6 { @@ -201,8 +217,8 @@ static void func() {} } namespace test9 { -template -static void completeRedeclChainForTemplateSpecialization() { } // expected-warning {{unused}} +template +static void completeRedeclChainForTemplateSpecialization() {} // expected-warning {{unused function template 'completeRedeclChainForTemplateSpecialization'}} } namespace test10 { @@ -216,8 +232,8 @@ constexpr T pi = T(3.14); namespace pr19713 { #if __cplusplus >= 201103L // FIXME: We should warn on both of these. - static constexpr int constexpr3() { return 1; } // expected-warning {{unused}} - constexpr int constexpr4() { return 2; } +static constexpr int constexpr3() { return 1; } // expected-warning {{unused function 'constexpr3'}} +constexpr int constexpr4() { return 2; } #endif } diff --git a/clang/test/SemaCXX/warn-variable-not-needed.cpp b/clang/test/SemaCXX/warn-variable-not-needed.cpp index 0fb0f8151b449..103be189068f8 100644 --- a/clang/test/SemaCXX/warn-variable-not-needed.cpp +++ b/clang/test/SemaCXX/warn-variable-not-needed.cpp @@ -2,9 +2,14 @@ namespace test1 { static int abc = 42; // expected-warning {{variable 'abc' is not needed and will not be emitted}} + + namespace { + template int abc_template = 0; + template <> int abc_template = 0; // expected-warning {{variable 'abc_template' is not needed and will not be emitted}} + } // namespace template int foo(void) { - return abc; + return abc + abc_template + abc_template; } } diff --git a/clang/test/SemaOpenCL/to_addr_builtin.cl b/clang/test/SemaOpenCL/to_addr_builtin.cl index ff2d7807356a2..8f4a20ac7662a 100644 --- a/clang/test/SemaOpenCL/to_addr_builtin.cl +++ b/clang/test/SemaOpenCL/to_addr_builtin.cl @@ -15,7 +15,7 @@ void test(void) { // expected-error@-2{{implicit declaration of function 'to_global' is invalid in OpenCL}} // expected-warning@-3{{incompatible integer to pointer conversion assigning to '__global int *__private' from 'int'}} #else - // expected-error@-5{{invalid number of arguments to function: 'to_global'}} + // expected-error@-5{{too many arguments to function call, expected 1, have 2}} #endif int x; diff --git a/clang/test/SemaOpenCL/usm-address-spaces-conversions.cl b/clang/test/SemaOpenCL/usm-address-spaces-conversions.cl new file mode 100644 index 0000000000000..35c333630da8e --- /dev/null +++ b/clang/test/SemaOpenCL/usm-address-spaces-conversions.cl @@ -0,0 +1,67 @@ +// RUN: %clang_cc1 %s -ffake-address-space-map -verify -pedantic -fsyntax-only -cl-std=CL2.0 +// RUN: %clang_cc1 %s -ffake-address-space-map -verify -pedantic -fsyntax-only -cl-std=CL2.0 -DGENERIC +// RUN: %clang_cc1 %s -ffake-address-space-map -verify -pedantic -fsyntax-only -cl-std=CL2.0 -DCONSTANT +// RUN: %clang_cc1 %s -ffake-address-space-map -verify -pedantic -fsyntax-only -cl-std=CL2.0 -DLOCAL + +/* USM (unified shared memory) extension for OpenCLC 2.0 adds two new address + * spaces: global_device and global_host that are a 
subset of __global address + * space. As ISO/IEC TR 18037 5.1.3 declares, it's possible to implicitly + * convert a subset address space to a superset address space, while conversion + * in a reversed direction could be achieved only with an explicit cast */ + +#ifdef GENERIC +#define AS_COMP __generic +#else +#define AS_COMP __global +#endif // GENERIC + +#ifdef CONSTANT +#define AS_INCOMP __constant +#elif LOCAL +#define AS_INCOMP __local +#else // PRIVATE +#define AS_INCOMP __private +#endif // CONSTANT + +void test(AS_COMP int *arg_comp, + __attribute__((opencl_global_device)) int *arg_device, + __attribute__((opencl_global_host)) int *arg_host) { + AS_COMP int *var_glob1 = arg_device; + AS_COMP int *var_glob2 = arg_host; + AS_COMP int *var_glob3 = (AS_COMP int *)arg_device; + AS_COMP int *var_glob4 = (AS_COMP int *)arg_host; + arg_device = (__attribute__((opencl_global_device)) int *)arg_comp; + arg_host = (__attribute__((opencl_global_host)) int *)arg_comp; +#ifdef GENERIC + // expected-error@+6{{assigning '__generic int *__private' to '__global_device int *__private' changes address space of pointer}} + // expected-error@+6{{assigning '__generic int *__private' to '__global_host int *__private' changes address space of pointer}} +#else + // expected-error@+3{{assigning '__global int *__private' to '__global_device int *__private' changes address space of pointer}} + // expected-error@+3{{assigning '__global int *__private' to '__global_host int *__private' changes address space of pointer}} +#endif // GENERIC + arg_device = arg_comp; + arg_host = arg_comp; + +#ifdef CONSTANT + // expected-error@+15{{initializing '__constant int *__private' with an expression of type '__global_device int *__private' changes address space of pointer}} + // expected-error@+15{{initializing '__constant int *__private' with an expression of type '__global_host int *__private' changes address space of pointer}} + // expected-error@+15{{initializing '__constant int *__private' with an expression of type '__global_device int *' changes address space of pointer}} + // expected-error@+16{{initializing '__constant int *__private' with an expression of type '__global_host int *' changes address space of pointer}} +#elif LOCAL + // expected-error@+10{{initializing '__local int *__private' with an expression of type '__global_device int *__private' changes address space of pointer}} + // expected-error@+10{{initializing '__local int *__private' with an expression of type '__global_host int *__private' changes address space of pointer}} + // expected-error@+10{{initializing '__local int *__private' with an expression of type '__global_device int *' changes address space of pointer}} + // expected-error@+11{{initializing '__local int *__private' with an expression of type '__global_host int *' changes address space of pointer}} +#else // PRIVATE + // expected-error@+5{{initializing '__private int *__private' with an expression of type '__global_device int *__private' changes address space of pointer}} + // expected-error@+5{{initializing '__private int *__private' with an expression of type '__global_host int *__private' changes address space of pointer}} + // expected-error@+5{{initializing '__private int *__private' with an expression of type '__global_device int *' changes address space of pointer}} + // expected-error@+6{{initializing '__private int *__private' with an expression of type '__global_host int *' changes address space of pointer}} +#endif // CONSTANT + AS_INCOMP int *var_incomp1 = arg_device; 
AS_INCOMP int *var_incomp2 = arg_host; + AS_INCOMP int *var_incomp3 = + (__attribute__((opencl_global_device)) int *)arg_device; + AS_INCOMP int *var_incomp4 = + (__attribute__((opencl_global_host)) int *)arg_host; +} diff --git a/clang/test/SemaOpenCLCXX/address-space-templates.cl b/clang/test/SemaOpenCLCXX/address-space-templates.cl index 6b304d2fdda45..be187de5684b0 100644 --- a/clang/test/SemaOpenCLCXX/address-space-templates.cl +++ b/clang/test/SemaOpenCLCXX/address-space-templates.cl @@ -22,10 +22,28 @@ void foo3() { __private T ii; // expected-error{{conflicting address space qualifiers are provided between types '__private T' and '__global int'}} } +template struct remove_reference { typedef _Tp type; }; +template struct remove_reference<_Tp &> { typedef _Tp type; }; +template struct as_pointer { + typedef typename remove_reference<_Tp>::type* type; +}; + +struct rep { + // CHECK |-CXXConstructorDecl {{.*}} rep 'void (const __generic rep &__private) __generic' + template::type> + rep(U&& v) {} +}; + +struct rep_outer : private rep { + rep_outer() + : rep(0) {} +}; + void bar() { S sintgl; // expected-note{{in instantiation of template class 'S' requested here}} foo1<__local int>(1); // expected-error{{no matching function for call to 'foo1'}} foo2<__global int>(0); foo3<__global int>(); // expected-note{{in instantiation of function template specialization 'foo3<__global int>' requested here}} + rep_outer r; } diff --git a/clang/test/SemaSYCL/union-kernel-param.cpp b/clang/test/SemaSYCL/union-kernel-param.cpp new file mode 100644 index 0000000000000..5267cf458b285 --- /dev/null +++ b/clang/test/SemaSYCL/union-kernel-param.cpp @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -I %S/Inputs -fsycl -fsycl-is-device -ast-dump %s | FileCheck %s +// expected-no-diagnostics + +// This test checks that compiler generates correct kernel arguments for +// union. 
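+// The union is passed to the kernel function by value, just as a struct
+// would be: the CHECK lines below expect a single ParmVarDecl of the union
+// type and a copy construction of the captured union from that parameter.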
+ +#include + +using namespace cl::sycl; + +typedef float realw; + +typedef union dpct_type_54e08f { + float cuda; +} gpu_realw_mem; + +void call_some_dummy_kernel(float data) { + data = 2.0f; +} + +template +__attribute__((sycl_kernel)) void parallel_for(Type lambda) { + lambda(); +} + +int main() { + gpu_realw_mem accel; + + parallel_for( + [=]() { + call_some_dummy_kernel(accel.cuda); + }); +} + +// Check kernel parameters +// CHECK: FunctionDecl {{.*}}kernel{{.*}} 'void (gpu_realw_mem)' +// CHECK-NEXT: ParmVarDecl {{.*}} used _arg_ 'gpu_realw_mem':'dpct_type_54e08f' +// Check kernel inits +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: DeclStmt +// CHECK-NEXT: VarDecl {{.*}} cinit +// CHECK-NEXT: InitListExpr +// CHECK-NEXT: CXXConstructExpr {{.*}} 'gpu_realw_mem':'dpct_type_54e08f' 'void (const dpct_type_54e08f &) noexcept' +// CHECK: ImplicitCastExpr +// CHECK: DeclRefExpr {{.*}} lvalue ParmVar {{.*}} '_arg_' 'gpu_realw_mem':'dpct_type_54e08f' diff --git a/clang/test/SemaTemplate/address_space-dependent.cpp b/clang/test/SemaTemplate/address_space-dependent.cpp index fa6cc68bba419..09ae0a620d9f3 100644 --- a/clang/test/SemaTemplate/address_space-dependent.cpp +++ b/clang/test/SemaTemplate/address_space-dependent.cpp @@ -101,7 +101,7 @@ int main() { car<1, 2, 3>(); // expected-note {{in instantiation of function template specialization 'car<1, 2, 3>' requested here}} HasASTemplateFields<1> HASTF; neg<-1>(); // expected-note {{in instantiation of function template specialization 'neg<-1>' requested here}} - correct<0x7FFFEE>(); + correct<0x7FFFF1>(); tooBig<8388650>(); // expected-note {{in instantiation of function template specialization 'tooBig<8388650>' requested here}} __attribute__((address_space(1))) char *x; diff --git a/clang/test/SemaTemplate/alias-template-nondependent.cpp b/clang/test/SemaTemplate/alias-template-nondependent.cpp new file mode 100644 index 0000000000000..e8ea16483a09f --- /dev/null +++ b/clang/test/SemaTemplate/alias-template-nondependent.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s + +namespace PR46377 { + template using IntPtr = int*; + template auto non_dependent_typedef() { + typedef int(*P)(IntPtr...); + return P(); + } + template auto non_dependent_alias() { + using P = int(*)(IntPtr...); + return P(); + } + template auto non_dependent_via_sizeof() { + using P = int(*)(int(...pack)[sizeof(sizeof(T))]); // expected-error {{invalid application of 'sizeof'}} + return P(); + } + + using a = int (*)(int*, int*); + using a = decltype(non_dependent_typedef()); + using a = decltype(non_dependent_alias()); + using a = decltype(non_dependent_via_sizeof()); + + using b = decltype(non_dependent_via_sizeof()); // expected-note {{instantiation of}} +} diff --git a/clang/test/SemaTemplate/class-template-decl.cpp b/clang/test/SemaTemplate/class-template-decl.cpp index dd9dcd2de9f4c..453218ac3b40e 100644 --- a/clang/test/SemaTemplate/class-template-decl.cpp +++ b/clang/test/SemaTemplate/class-template-decl.cpp @@ -1,17 +1,26 @@ // RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s -template class A; +template class A {}; extern "C++" { - template class B; + template class B {}; + template class A; + template<> class A; + template class A; + template class B; + template<> class B; + template class B; } namespace N { template class C; } -extern "C" { // expected-note {{extern "C" language linkage specification begins here}} +extern "C" { // expected-note 3 {{extern "C" language linkage specification begins here}} template class D; // 
expected-error{{templates must have C++ linkage}} + template class A; // expected-error{{templates must have C++ linkage}} + template<> class A; // expected-error{{templates must have C++ linkage}} + template class A; // OK (surprisingly) FIXME: Should we warn on this? } extern "C" { // expected-note 2 {{extern "C" language linkage specification begins here}} diff --git a/clang/test/SemaTemplate/member-access-expr.cpp b/clang/test/SemaTemplate/member-access-expr.cpp index 36d6022577289..d6627b954a287 100644 --- a/clang/test/SemaTemplate/member-access-expr.cpp +++ b/clang/test/SemaTemplate/member-access-expr.cpp @@ -160,3 +160,14 @@ namespace test6 { } }; } + +namespace test7 { + struct C { void g(); }; + template struct A { + T x; + static void f() { + (x.g()); // expected-error {{invalid use of member 'x' in static member function}} + } + }; + void h() { A::f(); } +} diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp index fdcc5006f5890..7538de3309027 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp @@ -448,3 +448,14 @@ namespace PR42108 { A(); // expected-error {{non-type template argument is not a constant expression}} expected-note 2{{temporary}} } } + +namespace PR46637 { + template auto> struct X { // expected-note {{here}} + auto call() { return f(); } + }; + X x; // expected-error {{incompatible initializer}} + + void *f(); + X y; + int n = y.call(); // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type 'void *'}} +} diff --git a/clang/test/SemaTemplate/template-decl-fail.cpp b/clang/test/SemaTemplate/template-decl-fail.cpp index ad134cdf225c2..7019caa2c2f31 100644 --- a/clang/test/SemaTemplate/template-decl-fail.cpp +++ b/clang/test/SemaTemplate/template-decl-fail.cpp @@ -4,7 +4,7 @@ template typedef T X; // expected-error{{typedef cannot be a templat template enum t0 { A = T::x }; // expected-error{{enumeration cannot be a template}} \ - // expected-warning{{declaration does not declare anything}} + // expected-error{{declaration does not declare anything}} enum e0 {}; template enum e0 f0(int a=x) {} diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index dacda6894a045..6c677eda9a093 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -91,6 +91,11 @@ ('%hmaptool', "'%s' %s" % (config.python_executable, os.path.join(config.clang_tools_dir, 'hmaptool')))) +# Strip C++ comments "//"" from tests +config.substitutions.append( + ('%strip_comments', "sed 's/[ \t]*\/\/.*//' %s") +) + # Plugins (loadable modules) if config.has_plugins and config.llvm_plugin_ext: config.available_features.add('plugins') diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 77b99b2013640..87047be3c2bc6 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -224,8 +224,7 @@ bool AssemblerInvocation::CreateFromArgs(AssemblerInvocation &Opts, if (const Arg *A = Args.getLastArg(OPT_compress_debug_sections, OPT_compress_debug_sections_EQ)) { if (A->getOption().getID() == OPT_compress_debug_sections) { - // TODO: be more clever about the compression type auto-detection - Opts.CompressDebugSections = llvm::DebugCompressionType::GNU; + Opts.CompressDebugSections = llvm::DebugCompressionType::Z; } else { Opts.CompressDebugSections = llvm::StringSwitch(A->getValue()) diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt index 
2e750ac9ea925..2d5d0172afedc 100644 --- a/clang/unittests/AST/CMakeLists.txt +++ b/clang/unittests/AST/CMakeLists.txt @@ -3,9 +3,6 @@ set(LLVM_LINK_COMPONENTS Support ) -if (MSVC) - set_source_files_properties(ASTImporterTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() add_clang_unittest(ASTTests ASTContextParentMapTest.cpp diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt index c9ac6c57d6897..b40b3886dcd2b 100644 --- a/clang/unittests/ASTMatchers/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/CMakeLists.txt @@ -3,15 +3,6 @@ set(LLVM_LINK_COMPONENTS Support ) -# By default MSVC has a 2^16 limit on the number of sections in an object file, -# and this needs more than that. -if (MSVC) - set_source_files_properties(InternalASTMatchersTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(NodeMatchersTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(NarrowingMatchersTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(ASTTraversalMatchersTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() - add_clang_unittest(ASTMatchersTests ASTMatchersInternalTest.cpp ASTMatchersNodeTest.cpp diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 9a52b9fb02620..4c222e24599f0 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -1,10 +1,6 @@ add_custom_target(ClangUnitTests) set_target_properties(ClangUnitTests PROPERTIES FOLDER "Clang tests") -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - if(CLANG_BUILT_STANDALONE) # LLVMTestingSupport library is needed for some of the unittests. if (EXISTS ${LLVM_MAIN_SRC_DIR}/lib/Testing/Support diff --git a/clang/unittests/CodeGen/IncrementalProcessingTest.cpp b/clang/unittests/CodeGen/IncrementalProcessingTest.cpp index 045ed9bbc7606..d1d921bb03c69 100644 --- a/clang/unittests/CodeGen/IncrementalProcessingTest.cpp +++ b/clang/unittests/CodeGen/IncrementalProcessingTest.cpp @@ -159,6 +159,11 @@ TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) { // First code should not end up in second module: ASSERT_FALSE(M[2]->getFunction("funcForProg1")); + // TODO: Remove this after the static initialization frontend implementation + // is recovered on AIX. 
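+ // (Returning early here skips only the global-init checks below; the
+ // assertions above have already run on AIX.)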
+ if (compiler.getTarget().getTriple().isOSAIX()) + return; + // Make sure global inits exist and are unique: const Function* GlobalInit1 = getGlobalInit(*M[1]); ASSERT_TRUE(GlobalInit1); diff --git a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp index 7c151c1821133..e67dcacca0a9b 100644 --- a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp +++ b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp @@ -16,7 +16,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h" #include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" -#include "llvm/Config/config.h" +#include "llvm/Config/llvm-config.h" #include "gtest/gtest.h" // FIXME: Use GTEST_SKIP() instead if GTest is updated to version 1.10.0 diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt index c439f5a786375..f290c3d2bedee 100644 --- a/clang/unittests/Tooling/CMakeLists.txt +++ b/clang/unittests/Tooling/CMakeLists.txt @@ -4,14 +4,6 @@ set(LLVM_LINK_COMPONENTS Support ) -# By default MSVC has a 2^16 limit on the number of sections in an object file, -# and this needs more than that. -if (MSVC) - set_source_files_properties(RecursiveASTVisitorTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(RecursiveASTVisitorTestExprVisitor.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(RecursiveASTVisitorTests/Callbacks.cpp PROPERTIES COMPILE_FLAGS /bigobj) - set_source_files_properties(SourceCodeTest.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() add_clang_unittest(ToolingTests ASTSelectionTest.cpp diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp index e2d7723eab116..64ddee7894eb9 100644 --- a/clang/unittests/Tooling/RangeSelectorTest.cpp +++ b/clang/unittests/Tooling/RangeSelectorTest.cpp @@ -193,8 +193,33 @@ TEST(RangeSelectorTest, AfterOp) { HasValue(EqualsCharSourceRange(ExpectedAfter))); } +TEST(RangeSelectorTest, BetweenOp) { + StringRef Code = R"cc( + int f(int x, int y, int z) { return 3; } + int g() { return f(3, /* comment */ 7 /* comment */, 9); } + )cc"; + auto Matcher = callExpr(hasArgument(0, expr().bind("a0")), + hasArgument(1, expr().bind("a1"))); + RangeSelector R = between(node("a0"), node("a1")); + TestMatch Match = matchCode(Code, Matcher); + EXPECT_THAT_EXPECTED(select(R, Match), HasValue(", /* comment */ ")); +} + +TEST(RangeSelectorTest, BetweenOpParsed) { + StringRef Code = R"cc( + int f(int x, int y, int z) { return 3; } + int g() { return f(3, /* comment */ 7 /* comment */, 9); } + )cc"; + auto Matcher = callExpr(hasArgument(0, expr().bind("a0")), + hasArgument(1, expr().bind("a1"))); + auto R = parseRangeSelector(R"rs(between(node("a0"), node("a1")))rs"); + ASSERT_THAT_EXPECTED(R, llvm::Succeeded()); + TestMatch Match = matchCode(Code, Matcher); + EXPECT_THAT_EXPECTED(select(*R, Match), HasValue(", /* comment */ ")); +} + // Node-id specific version. 
-TEST(RangeSelectorTest, RangeOpNodes) { +TEST(RangeSelectorTest, EncloseOpNodes) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -206,7 +231,7 @@ TEST(RangeSelectorTest, RangeOpNodes) { EXPECT_THAT_EXPECTED(select(R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpGeneral) { +TEST(RangeSelectorTest, EncloseOpGeneral) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -218,7 +243,7 @@ TEST(RangeSelectorTest, RangeOpGeneral) { EXPECT_THAT_EXPECTED(select(R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpNodesParsed) { +TEST(RangeSelectorTest, EncloseOpNodesParsed) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } @@ -231,7 +256,7 @@ TEST(RangeSelectorTest, RangeOpNodesParsed) { EXPECT_THAT_EXPECTED(select(*R, Match), HasValue("3, 7")); } -TEST(RangeSelectorTest, RangeOpGeneralParsed) { +TEST(RangeSelectorTest, EncloseOpGeneralParsed) { StringRef Code = R"cc( int f(int x, int y, int z) { return 3; } int g() { return f(/* comment */ 3, 7 /* comment */, 9); } diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp index bd639aa581661..a722ca2b1a45a 100644 --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -35,6 +35,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include +#include using namespace clang; @@ -59,22 +60,25 @@ class SyntaxTreeTest : public ::testing::Test, class BuildSyntaxTree : public ASTConsumer { public: BuildSyntaxTree(syntax::TranslationUnit *&Root, + std::unique_ptr &TB, std::unique_ptr &Arena, std::unique_ptr Tokens) - : Root(Root), Arena(Arena), Tokens(std::move(Tokens)) { + : Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) { assert(this->Tokens); } void HandleTranslationUnit(ASTContext &Ctx) override { - Arena = std::make_unique(Ctx.getSourceManager(), - Ctx.getLangOpts(), - std::move(*Tokens).consume()); + TB = + std::make_unique(std::move(*Tokens).consume()); Tokens = nullptr; // make sure we fail if this gets called twice. + Arena = std::make_unique(Ctx.getSourceManager(), + Ctx.getLangOpts(), *TB); Root = syntax::buildSyntaxTree(*Arena, *Ctx.getTranslationUnitDecl()); } private: syntax::TranslationUnit *&Root; + std::unique_ptr &TB; std::unique_ptr &Arena; std::unique_ptr Tokens; }; @@ -82,20 +86,22 @@ class SyntaxTreeTest : public ::testing::Test, class BuildSyntaxTreeAction : public ASTFrontendAction { public: BuildSyntaxTreeAction(syntax::TranslationUnit *&Root, + std::unique_ptr &TB, std::unique_ptr &Arena) - : Root(Root), Arena(Arena) {} + : Root(Root), TB(TB), Arena(Arena) {} std::unique_ptr CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override { // We start recording the tokens, ast consumer will take on the result. 
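// (With this change the recorded tokens are consumed into a separate
// syntax::TokenBuffer, kept alive in TB; the Arena now borrows that buffer
// rather than owning the token stream itself.)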
auto Tokens = std::make_unique(CI.getPreprocessor()); - return std::make_unique(Root, Arena, + return std::make_unique(Root, TB, Arena, std::move(Tokens)); } private: syntax::TranslationUnit *&Root; + std::unique_ptr &TB; std::unique_ptr &Arena; }; @@ -132,7 +138,7 @@ class SyntaxTreeTest : public ::testing::Test, Compiler.setSourceManager(SourceMgr.get()); syntax::TranslationUnit *Root = nullptr; - BuildSyntaxTreeAction Recorder(Root, this->Arena); + BuildSyntaxTreeAction Recorder(Root, this->TB, this->Arena); // Action could not be executed but the frontend didn't identify any errors // in the code ==> problem in setting up the action. @@ -204,6 +210,7 @@ class SyntaxTreeTest : public ::testing::Test, new SourceManager(*Diags, *FileMgr); std::shared_ptr Invocation; // Set after calling buildTree(). + std::unique_ptr TB; std::unique_ptr Arena; }; diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp index 5bd2864b5ba1e..cc6f453284d71 100644 --- a/clang/unittests/Tooling/ToolingTest.cpp +++ b/clang/unittests/Tooling/ToolingTest.cpp @@ -621,7 +621,7 @@ TEST(addTargetAndModeForProgramName, AddsTargetAndMode) { addTargetAndModeForProgramName(Args, ""); EXPECT_EQ((std::vector{"clang", "-foo"}), Args); addTargetAndModeForProgramName(Args, Target + "-g++"); - EXPECT_EQ((std::vector{"clang", "-target", Target, + EXPECT_EQ((std::vector{"clang", "--target=" + Target, "--driver-mode=g++", "-foo"}), Args); } @@ -635,7 +635,7 @@ TEST(addTargetAndModeForProgramName, PathIgnored) { std::vector Args = {"clang", "-foo"}; addTargetAndModeForProgramName(Args, ToolPath); - EXPECT_EQ((std::vector{"clang", "-target", Target, + EXPECT_EQ((std::vector{"clang", "--target=" + Target, "--driver-mode=g++", "-foo"}), Args); } @@ -650,10 +650,10 @@ TEST(addTargetAndModeForProgramName, IgnoresExistingTarget) { "-target", "something"}), Args); - std::vector ArgsAlt = {"clang", "-foo", "-target=something"}; + std::vector ArgsAlt = {"clang", "-foo", "--target=something"}; addTargetAndModeForProgramName(ArgsAlt, Target + "-g++"); EXPECT_EQ((std::vector{"clang", "--driver-mode=g++", "-foo", - "-target=something"}), + "--target=something"}), ArgsAlt); } @@ -663,15 +663,9 @@ TEST(addTargetAndModeForProgramName, IgnoresExistingMode) { std::vector Args = {"clang", "-foo", "--driver-mode=abc"}; addTargetAndModeForProgramName(Args, Target + "-g++"); - EXPECT_EQ((std::vector{"clang", "-target", Target, "-foo", + EXPECT_EQ((std::vector{"clang", "--target=" + Target, "-foo", "--driver-mode=abc"}), Args); - - std::vector ArgsAlt = {"clang", "-foo", "--driver-mode", "abc"}; - addTargetAndModeForProgramName(ArgsAlt, Target + "-g++"); - EXPECT_EQ((std::vector{"clang", "-target", Target, "-foo", - "--driver-mode", "abc"}), - ArgsAlt); } #ifndef _WIN32 diff --git a/clang/unittests/Tooling/TransformerTest.cpp b/clang/unittests/Tooling/TransformerTest.cpp index 59b334b0ea5a4..77fd380410b27 100644 --- a/clang/unittests/Tooling/TransformerTest.cpp +++ b/clang/unittests/Tooling/TransformerTest.cpp @@ -10,6 +10,7 @@ #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Tooling/Tooling.h" #include "clang/Tooling/Transformer/RangeSelector.h" +#include "clang/Tooling/Transformer/RewriteRule.h" #include "clang/Tooling/Transformer/Stencil.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" @@ -113,7 +114,9 @@ class ClangRefactoringTestBase : public testing::Test { if (C) { Changes.push_back(std::move(*C)); } else { - consumeError(C.takeError()); + // FIXME: stash this error rather than 
printing. + llvm::errs() << "Error generating changes: " + << llvm::toString(C.takeError()) << "\n"; ++ErrorCount; } }; @@ -378,6 +381,155 @@ TEST_F(TransformerTest, NodePartMemberMultiToken) { Input, Expected); } +TEST_F(TransformerTest, NoEdits) { + using transformer::noEdits; + std::string Input = "int f(int x) { return x; }"; + testRule(makeRule(returnStmt().bind("return"), noEdits()), Input, Input); +} + +TEST_F(TransformerTest, IfBound2Args) { + using transformer::ifBound; + std::string Input = "int f(int x) { return x; }"; + std::string Expected = "int f(int x) { CHANGE; }"; + testRule(makeRule(returnStmt().bind("return"), + ifBound("return", changeTo(cat("CHANGE;")))), + Input, Expected); +} + +TEST_F(TransformerTest, IfBound3Args) { + using transformer::ifBound; + std::string Input = "int f(int x) { return x; }"; + std::string Expected = "int f(int x) { CHANGE; }"; + testRule(makeRule(returnStmt().bind("return"), + ifBound("nothing", changeTo(cat("ERROR")), + changeTo(cat("CHANGE;")))), + Input, Expected); +} + +TEST_F(TransformerTest, ShrinkTo) { + using transformer::shrinkTo; + std::string Input = "int f(int x) { return x; }"; + std::string Expected = "return x;"; + testRule(makeRule(functionDecl(hasDescendant(returnStmt().bind("return"))) + .bind("function"), + shrinkTo(node("function"), node("return"))), + Input, Expected); +} + +// Rewrite various Stmts inside a Decl. +TEST_F(TransformerTest, RewriteDescendantsDeclChangeStmt) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int f(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f")).bind("fun"), + rewriteDescendants("fun", InlineX)), + Input, Expected); +} + +// Rewrite various TypeLocs inside a Decl. +TEST_F(TransformerTest, RewriteDescendantsDeclChangeTypeLoc) { + std::string Input = "int f(int *x) { return *x; }"; + std::string Expected = "char f(char *x) { return *x; }"; + auto IntToChar = makeRule(typeLoc(loc(qualType(isInteger(), builtinType()))), + changeTo(cat("char"))); + testRule(makeRule(functionDecl(hasName("f")).bind("fun"), + rewriteDescendants("fun", IntToChar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsStmt) { + // Add an unrelated definition to the header that also has a variable named + // "x", to test that the rewrite is limited to the scope we intend. 
+ appendToHeader(R"cc(int g(int x) { return x; })cc"); + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int f(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f"), hasBody(stmt().bind("body"))), + rewriteDescendants("body", InlineX)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsStmtWithAdditionalChange) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + std::string Expected = + "int newName(int x) { int y = 3; { int z = 3 * 3; } return 3; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + testRule( + makeRule( + functionDecl(hasName("f"), hasBody(stmt().bind("body"))).bind("f"), + flatten(changeTo(name("f"), cat("newName")), + rewriteDescendants("body", InlineX))), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsTypeLoc) { + std::string Input = "int f(int *x) { return *x; }"; + std::string Expected = "int f(char *x) { return *x; }"; + auto IntToChar = + makeRule(typeLoc(loc(qualType(isInteger(), builtinType()))).bind("loc"), + changeTo(cat("char"))); + testRule( + makeRule(functionDecl(hasName("f"), + hasParameter(0, varDecl(hasTypeLoc( + typeLoc().bind("parmType"))))), + rewriteDescendants("parmType", IntToChar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsReferToParentBinding) { + std::string Input = + "int f(int p) { int y = p; { int z = p * p; } return p; }"; + std::string Expected = + "int f(int p) { int y = 3; { int z = 3 * 3; } return 3; }"; + std::string VarId = "var"; + auto InlineVar = makeRule(declRefExpr(to(varDecl(equalsBoundNode(VarId)))), + changeTo(cat("3"))); + testRule(makeRule(functionDecl(hasName("f"), + hasParameter(0, varDecl().bind(VarId))) + .bind("fun"), + rewriteDescendants("fun", InlineVar)), + Input, Expected); +} + +TEST_F(TransformerTest, RewriteDescendantsUnboundNode) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + auto InlineX = + makeRule(declRefExpr(to(varDecl(hasName("x")))), changeTo(cat("3"))); + Transformer T(makeRule(functionDecl(hasName("f")), + rewriteDescendants("UNBOUND", InlineX)), + consumer()); + T.registerMatchers(&MatchFinder); + EXPECT_FALSE(rewrite(Input)); + EXPECT_THAT(Changes, IsEmpty()); + EXPECT_EQ(ErrorCount, 1); +} + +TEST_F(TransformerTest, RewriteDescendantsInvalidNodeType) { + std::string Input = + "int f(int x) { int y = x; { int z = x * x; } return x; }"; + auto IntToChar = + makeRule(qualType(isInteger(), builtinType()), changeTo(cat("char"))); + Transformer T( + makeRule(functionDecl( + hasName("f"), + hasParameter(0, varDecl(hasType(qualType().bind("type"))))), + rewriteDescendants("type", IntToChar)), + consumer()); + T.registerMatchers(&MatchFinder); + EXPECT_FALSE(rewrite(Input)); + EXPECT_THAT(Changes, IsEmpty()); + EXPECT_EQ(ErrorCount, 1); +} + TEST_F(TransformerTest, InsertBeforeEdit) { std::string Input = R"cc( int f() { @@ -497,6 +649,90 @@ TEST_F(TransformerTest, MultiChange) { Input, Expected); } +TEST_F(TransformerTest, EditList) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + 
testRule(makeRule(ifStmt(hasCondition(expr().bind(C)), + hasThen(stmt().bind(T)), hasElse(stmt().bind(E))), + editList({changeTo(node(std::string(C)), cat("true")), + changeTo(statement(std::string(T)), + cat("{ /* then */ }")), + changeTo(statement(std::string(E)), + cat("{ /* else */ }"))})), + Input, Expected); +} + +TEST_F(TransformerTest, Flatten) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + testRule( + makeRule( + ifStmt(hasCondition(expr().bind(C)), hasThen(stmt().bind(T)), + hasElse(stmt().bind(E))), + flatten(changeTo(node(std::string(C)), cat("true")), + changeTo(statement(std::string(T)), cat("{ /* then */ }")), + changeTo(statement(std::string(E)), cat("{ /* else */ }")))), + Input, Expected); +} + +TEST_F(TransformerTest, FlattenWithMixedArgs) { + using clang::transformer::editList; + std::string Input = R"cc( + void foo() { + if (10 > 1.0) + log(1) << "oh no!"; + else + log(0) << "ok"; + } + )cc"; + std::string Expected = R"( + void foo() { + if (true) { /* then */ } + else { /* else */ } + } + )"; + + StringRef C = "C", T = "T", E = "E"; + testRule(makeRule(ifStmt(hasCondition(expr().bind(C)), + hasThen(stmt().bind(T)), hasElse(stmt().bind(E))), + flatten(changeTo(node(std::string(C)), cat("true")), + edit(changeTo(statement(std::string(T)), + cat("{ /* then */ }"))), + editList({changeTo(statement(std::string(E)), + cat("{ /* else */ }"))}))), + Input, Expected); +} + TEST_F(TransformerTest, OrderedRuleUnrelated) { StringRef Flag = "flag"; RewriteRule FlagRule = makeRule( diff --git a/clang/utils/analyzer/SATest.py b/clang/utils/analyzer/SATest.py index 46e636ad2895e..86571902502f9 100755 --- a/clang/utils/analyzer/SATest.py +++ b/clang/utils/analyzer/SATest.py @@ -78,7 +78,7 @@ def update(parser, args): project_map = ProjectMap() for project in project_map.projects: - SATestUpdateDiffs.update_reference_results(project) + SATestUpdateDiffs.update_reference_results(project, args.git) def benchmark(parser, args): @@ -277,7 +277,8 @@ def main(): "update", help="Update static analyzer reference results based on the previous " "run of SATest build. 
Assumes that SATest build was just run.") - # TODO: add option to decide whether we should use git + upd_parser.add_argument("--git", action="store_true", + help="Stage updated results using git.") upd_parser.set_defaults(func=update) # docker subcommand diff --git a/clang/utils/analyzer/SATestUpdateDiffs.py b/clang/utils/analyzer/SATestUpdateDiffs.py index 920fa15e4c6f5..69b3383beaf17 100644 --- a/clang/utils/analyzer/SATestUpdateDiffs.py +++ b/clang/utils/analyzer/SATestUpdateDiffs.py @@ -15,7 +15,7 @@ Verbose = 0 -def update_reference_results(project: ProjectInfo): +def update_reference_results(project: ProjectInfo, git: bool = False): test_info = SATestBuild.TestInfo(project) tester = SATestBuild.ProjectTester(test_info) project_dir = tester.get_project_dir() @@ -27,9 +27,10 @@ def update_reference_results(project: ProjectInfo): created_results_path = tester.get_output_dir() if not os.path.exists(created_results_path): - print("New results not found, was SATestBuild.py previously run?", + print(f"Skipping project '{project.name}', " + f"it doesn't have newer results.", file=sys.stderr) - sys.exit(1) + return build_log_path = SATestBuild.get_build_log_path(ref_results_path) build_log_dir = os.path.dirname(os.path.abspath(build_log_path)) @@ -45,7 +46,8 @@ def run_cmd(command: str): # Remove reference results: in git, and then again for a good measure # with rm, as git might not remove things fully if there are empty # directories involved. - run_cmd(f"git rm -r -q '{ref_results_path}'") + if git: + run_cmd(f"git rm -r -q '{ref_results_path}'") shutil.rmtree(ref_results_path) # Replace reference results with a freshly computed once. @@ -60,22 +62,11 @@ def run_cmd(command: str): # Clean up the generated difference results. SATestBuild.cleanup_reference_results(ref_results_path) - run_cmd(f"git add '{ref_results_path}'") + if git: + run_cmd(f"git add '{ref_results_path}'") -# TODO: use argparse -def main(argv): - if len(argv) == 2 and argv[1] in ("-h", "--help"): - print("Update static analyzer reference results based " - "\non the previous run of SATestBuild.py.\n" - "\nN.B.: Assumes that SATestBuild.py was just run", - file=sys.stderr) - sys.exit(1) - - project_map = ProjectMap() - for project in project_map.projects: - update_reference_results(project) - - -if __name__ == '__main__': - main(sys.argv) +if __name__ == "__main__": + print("SATestUpdateDiffs.py should not be used on its own.") + print("Please use 'SATest.py update' instead") + sys.exit(1) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index c7369525c36fb..74319b1389437 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -13633,7 +13633,7 @@
2303 DRWP Partial ordering and recursive variadic inheritance - Unknown + Clang 12 2304 diff --git a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake index 07b589beb2d10..0b679dbf68fae 100644 --- a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake @@ -70,29 +70,36 @@ function(clang_compile object_file source) if (TARGET CompilerRTUnitTestCheckCxx) list(APPEND SOURCE_DEPS CompilerRTUnitTestCheckCxx) endif() - string(REGEX MATCH "[.](cc|cpp)$" is_cxx ${source_rpath}) - string(REGEX MATCH "[.](m|mm)$" is_objc ${source_rpath}) - if(is_cxx) - string(REPLACE " " ";" global_flags "${CMAKE_CXX_FLAGS}") - else() - string(REPLACE " " ";" global_flags "${CMAKE_C_FLAGS}") - endif() + if(COMPILER_RT_STANDALONE_BUILD) + # Only add global flags in standalone build. + string(REGEX MATCH "[.](cc|cpp)$" is_cxx ${source_rpath}) + if(is_cxx) + string(REPLACE " " ";" global_flags "${CMAKE_CXX_FLAGS}") + else() + string(REPLACE " " ";" global_flags "${CMAKE_C_FLAGS}") + endif() - if (MSVC) - translate_msvc_cflags(global_flags "${global_flags}") - endif() + if (MSVC) + translate_msvc_cflags(global_flags "${global_flags}") + endif() - if (APPLE) - set(global_flags ${OSX_SYSROOT_FLAG} ${global_flags}) + if (APPLE) + set(global_flags ${OSX_SYSROOT_FLAG} ${global_flags}) + endif() + + # Ignore unknown warnings. CMAKE_CXX_FLAGS may contain GCC-specific options + # which are not supported by Clang. + list(APPEND global_flags -Wno-unknown-warning-option) + set(compile_flags ${global_flags} ${SOURCE_CFLAGS}) + else() + set(compile_flags ${SOURCE_CFLAGS}) endif() + + string(REGEX MATCH "[.](m|mm)$" is_objc ${source_rpath}) if (is_objc) - list(APPEND global_flags -ObjC) + list(APPEND compile_flags "-ObjC") endif() - # Ignore unknown warnings. CMAKE_CXX_FLAGS may contain GCC-specific options - # which are not supported by Clang. - list(APPEND global_flags -Wno-unknown-warning-option) - set(compile_flags ${global_flags} ${SOURCE_CFLAGS}) add_custom_command( OUTPUT ${object_file} COMMAND ${COMPILER_RT_TEST_COMPILER} ${compile_flags} -c diff --git a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake index be8d7e733c7a0..a31bcc3963e17 100644 --- a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake @@ -289,6 +289,15 @@ macro(darwin_add_builtin_library name suffix) endforeach(cflag) endif() + if ("${LIB_OS}" MATCHES ".*sim$") + # Pass an explicit -simulator environment to the -target option to ensure + # that we don't rely on the architecture to infer whether we're building + # for the simulator. 
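+ # For example (hypothetical values): with LIB_OS=iossim, LIB_ARCH=x86_64,
+ # and a 13.0 minimum version, the line below expands to
+ # -target x86_64-apple-ios13.0-simulator.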
+ string(REGEX REPLACE "sim" "" base_os "${LIB_OS}") + list(APPEND builtin_cflags + -target "${LIB_ARCH}-apple-${base_os}${DARWIN_${LIBOS}_BUILTIN_MIN_VER}-simulator") + endif() + set_target_compile_flags(${libname} ${sysroot_flag} ${DARWIN_${LIB_OS}_BUILTIN_MIN_VER_FLAG} @@ -389,7 +398,8 @@ macro(darwin_add_builtin_libraries) set(PROFILE_SOURCES ../profile/InstrProfiling ../profile/InstrProfilingBuffer ../profile/InstrProfilingPlatformDarwin - ../profile/InstrProfilingWriter) + ../profile/InstrProfilingWriter + ../profile/InstrProfilingInternal) foreach (os ${ARGN}) list_intersect(DARWIN_BUILTIN_ARCHS DARWIN_${os}_BUILTIN_ARCHS BUILTIN_SUPPORTED_ARCH) foreach (arch ${DARWIN_BUILTIN_ARCHS}) @@ -449,16 +459,13 @@ macro(darwin_add_builtin_libraries) endif() endforeach() - # We put the x86 sim slices into the archives for their base OS foreach (os ${ARGN}) - if(NOT ${os} MATCHES ".*sim$") - darwin_lipo_libs(clang_rt.${os} - PARENT_TARGET builtins - LIPO_FLAGS ${${os}_builtins_lipo_flags} ${${os}sim_builtins_lipo_flags} - DEPENDS ${${os}_builtins_libs} ${${os}sim_builtins_libs} - OUTPUT_DIR ${COMPILER_RT_LIBRARY_OUTPUT_DIR} - INSTALL_DIR ${COMPILER_RT_LIBRARY_INSTALL_DIR}) - endif() + darwin_lipo_libs(clang_rt.${os} + PARENT_TARGET builtins + LIPO_FLAGS ${${os}_builtins_lipo_flags} + DEPENDS ${${os}_builtins_libs} + OUTPUT_DIR ${COMPILER_RT_LIBRARY_OUTPUT_DIR} + INSTALL_DIR ${COMPILER_RT_LIBRARY_INSTALL_DIR}) endforeach() darwin_add_embedded_builtin_libraries() endmacro() diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 0a27910ed4943..74fef8933ef90 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -157,6 +157,19 @@ check_library_exists(stdc++ __cxa_throw "" COMPILER_RT_HAS_LIBSTDCXX) check_linker_flag("-Wl,-z,text" COMPILER_RT_HAS_Z_TEXT) check_linker_flag("-fuse-ld=lld" COMPILER_RT_HAS_FUSE_LD_LLD_FLAG) +set(VERS_COMPAT_OPTION "-Wl,-z,gnu-version-script-compat") +check_linker_flag("${VERS_COMPAT_OPTION}" COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) + +set(DUMMY_VERS ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/dummy.vers) +file(WRITE ${DUMMY_VERS} "{};") +set(VERS_OPTION "-Wl,--version-script,${DUMMY_VERS}") +if(COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) + # Solaris 11.4 ld only supports --version-script with + # -z gnu-version-script-compat. + string(APPEND VERS_OPTION " ${VERS_COMPAT_OPTION}") +endif() +check_linker_flag("${VERS_OPTION}" COMPILER_RT_HAS_VERSION_SCRIPT) + if(ANDROID) check_linker_flag("-Wl,-z,global" COMPILER_RT_HAS_Z_GLOBAL) check_library_exists(log __android_log_write "" COMPILER_RT_HAS_LIBLOG) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index a6913527e67f0..6d0ffb12294b0 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -157,6 +157,8 @@ VALUE_PROF_FUNC_PARAM(void *, Data, Type::getInt8PtrTy(Ctx)) INSTR_PROF_COMMA #ifndef VALUE_RANGE_PROF VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) #else /* VALUE_RANGE_PROF */ +/* FIXME: This is to be removed after switching to the new memop value + * profiling. 
*/ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) \ INSTR_PROF_COMMA VALUE_PROF_FUNC_PARAM(uint64_t, PreciseRangeStart, Type::getInt64Ty(Ctx)) \ @@ -753,9 +755,14 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_VALUE_PROF_FUNC __llvm_profile_instrument_target #define INSTR_PROF_VALUE_PROF_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_FUNC) +/* FIXME: This is to be removed after switching to the new memop value + * profiling. */ #define INSTR_PROF_VALUE_RANGE_PROF_FUNC __llvm_profile_instrument_range #define INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_RANGE_PROF_FUNC) +#define INSTR_PROF_VALUE_PROF_MEMOP_FUNC __llvm_profile_instrument_memop +#define INSTR_PROF_VALUE_PROF_MEMOP_FUNC_STR \ + INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_MEMOP_FUNC) /* InstrProfile per-function control data alignment. */ #define INSTR_PROF_DATA_ALIGNMENT 8 @@ -783,3 +790,121 @@ typedef struct InstrProfValueData { #endif #undef COVMAP_V2_OR_V3 + +#ifdef INSTR_PROF_VALUE_PROF_MEMOP_API + +#ifdef __cplusplus +#define INSTR_PROF_INLINE inline +#else +#define INSTR_PROF_INLINE +#endif + +/* The value range buckets (22 buckets) for the memop size value profiling looks + * like: + * + * [0, 0] + * [1, 1] + * [2, 2] + * [3, 3] + * [4, 4] + * [5, 5] + * [6, 6] + * [7, 7] + * [8, 8] + * [9, 15] + * [16, 16] + * [17, 31] + * [32, 32] + * [33, 63] + * [64, 64] + * [65, 127] + * [128, 128] + * [129, 255] + * [256, 256] + * [257, 511] + * [512, 512] + * [513, UINT64_MAX] + * + * Each range has a 'representative value' which is the lower end value of the + * range and used to store in the runtime profile data records and the VP + * metadata. For example, it's 2 for [2, 2] and 64 for [65, 127]. + */ + +/* + * Clz and Popcount. This code was copied from + * compiler-rt/lib/fuzzer/{FuzzerBuiltins.h,FuzzerBuiltinsMsvc.h} and + * llvm/include/llvm/Support/MathExtras.h. + */ +#if defined(_MSC_VER) && !defined(__clang__) + +#include +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfClzll(unsigned long long X) { + unsigned long LeadZeroIdx = 0; +#if !defined(_M_ARM64) && !defined(_M_X64) + // Scan the high 32 bits. + if (_BitScanReverse(&LeadZeroIdx, (unsigned long)(X >> 32))) + return (int)(63 - (LeadZeroIdx + 32)); // Create a bit offset + // from the MSB. + // Scan the low 32 bits. + if (_BitScanReverse(&LeadZeroIdx, (unsigned long)(X))) + return (int)(63 - LeadZeroIdx); +#else + if (_BitScanReverse64(&LeadZeroIdx, X)) return 63 - LeadZeroIdx; +#endif + return 64; +} +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfPopcountll(unsigned long long X) { + // This code originates from https://reviews.llvm.org/rG30626254510f. + unsigned long long v = X; + v = v - ((v >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (int)((unsigned long long)(v * 0x0101010101010101ULL) >> 56); +} + +#else + +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfClzll(unsigned long long X) { return __builtin_clzll(X); } +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfPopcountll(unsigned long long X) { return __builtin_popcountll(X); } + +#endif /* defined(_MSC_VER) && !defined(__clang__) */ + +/* Map an (observed) memop size value to the representative value of its range. + * For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. 
*/ +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint64_t +InstrProfGetRangeRepValue(uint64_t Value) { + if (Value <= 8) + // The first ranges are individually tracked. Use the value as is. + return Value; + else if (Value >= 513) + // The last range is mapped to its lowest value. + return 513; + else if (InstProfPopcountll(Value) == 1) + // If it's a power of two, use it as is. + return Value; + else + // Otherwise, take to the previous power of two + 1. + return (1 << (64 - InstProfClzll(Value) - 1)) + 1; +} + +/* Return true if the range that an (observed) memop size value belongs to has + * only a single value in the range. For example, 0 -> true, 8 -> true, 10 -> + * false, 64 -> true, 100 -> false, 513 -> false. */ +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE unsigned +InstrProfIsSingleValRange(uint64_t Value) { + if (Value <= 8) + // The first ranges are individually tracked. + return 1; + else if (InstProfPopcountll(Value) == 1) + // If it's a power of two, there's only one value. + return 1; + else + // Otherwise, there's more than one value in the range. + return 0; +} + +#endif /* INSTR_PROF_VALUE_PROF_MEMOP_API */ diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt index 2a1bbb58cce41..41e889eb79e6f 100644 --- a/compiler-rt/lib/asan/CMakeLists.txt +++ b/compiler-rt/lib/asan/CMakeLists.txt @@ -224,7 +224,7 @@ else() PARENT_TARGET asan) foreach(arch ${ASAN_SUPPORTED_ARCH}) - if (UNIX) + if (COMPILER_RT_HAS_VERSION_SCRIPT) add_sanitizer_rt_version_list(clang_rt.asan-dynamic-${arch} LIBS clang_rt.asan-${arch} clang_rt.asan_cxx-${arch} EXTRA asan.syms.extra) @@ -232,7 +232,7 @@ else() -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers) # The Solaris 11.4 linker supports a subset of GNU ld version scripts, # but requires a special option to enable it. 
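
// [Sketch, not part of the patch] Standalone restatement of the
// InstrProfGetRangeRepValue helper added above in InstrProfData.inc, with
// the worked examples from its comment checked in code. Assumes a compiler
// providing __builtin_clzll / __builtin_popcountll.
#include <cassert>
#include <cstdint>
static uint64_t rangeRep(uint64_t V) {
  if (V <= 8)                       // buckets [0,0]..[8,8] are tracked exactly
    return V;
  if (V >= 513)                     // last bucket is [513, UINT64_MAX]
    return 513;
  if (__builtin_popcountll(V) == 1) // powers of two get their own bucket
    return V;
  // Otherwise: previous power of two + 1, the low end of [2^k+1, 2^(k+1)-1].
  return (1ULL << (63 - __builtin_clzll(V))) + 1;
}
int main() {
  assert(rangeRep(5) == 5 && rangeRep(22) == 17 && rangeRep(99) == 65);
  assert(rangeRep(256) == 256 && rangeRep(1001) == 513);
  return 0;
}
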
- if (OS_NAME MATCHES "SunOS") + if (COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) list(APPEND VERSION_SCRIPT_FLAG -Wl,-z,gnu-version-script-compat) endif() set_property(SOURCE diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 126d26d0823b3..c2f2804769d10 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -732,6 +732,25 @@ struct Allocator { return reinterpret_cast(alloc_beg); } + AsanChunk *GetAsanChunkDebug(void *alloc_beg) { + if (!alloc_beg) return nullptr; + if (!allocator.FromPrimary(alloc_beg)) { + uptr *meta = reinterpret_cast(allocator.GetMetaData(alloc_beg)); + AsanChunk *m = reinterpret_cast(meta[1]); + Printf("GetAsanChunkDebug1 alloc_beg %p meta %p m %p\n", alloc_beg, meta, m); + return m; + } + uptr *alloc_magic = reinterpret_cast(alloc_beg); + Printf( + "GetAsanChunkDebug2 alloc_beg %p alloc_magic %p alloc_magic[0] %p " + "alloc_magic[1] %p\n", + alloc_beg, alloc_magic, alloc_magic[0], alloc_magic[1]); + if (alloc_magic[0] == kAllocBegMagic) + return reinterpret_cast(alloc_magic[1]); + return reinterpret_cast(alloc_beg); + } + + AsanChunk *GetAsanChunkByAddr(uptr p) { void *alloc_beg = allocator.GetBlockBegin(reinterpret_cast(p)); return GetAsanChunk(alloc_beg); @@ -744,6 +763,13 @@ struct Allocator { return GetAsanChunk(alloc_beg); } + AsanChunk *GetAsanChunkByAddrFastLockedDebug(uptr p) { + void *alloc_beg = + allocator.GetBlockBeginFastLockedDebug(reinterpret_cast(p)); + Printf("GetAsanChunkByAddrFastLockedDebug p %p alloc_beg %p\n", p, alloc_beg); + return GetAsanChunkDebug(alloc_beg); + } + uptr AllocationSize(uptr p) { AsanChunk *m = GetAsanChunkByAddr(p); if (!m) return 0; @@ -1040,16 +1066,25 @@ uptr PointsIntoChunk(void* p) { // Debug code. Delete once issue #1193 is chased down. extern "C" SANITIZER_WEAK_ATTRIBUTE const char *__lsan_current_stage; +void GetUserBeginDebug(uptr chunk) { + Printf("GetUserBeginDebug1 chunk %p\n", chunk); + __asan::AsanChunk *m = __asan::instance.GetAsanChunkByAddrFastLockedDebug(chunk); + Printf("GetUserBeginDebug2 m %p\n", m); +} + uptr GetUserBegin(uptr chunk) { __asan::AsanChunk *m = __asan::instance.GetAsanChunkByAddrFastLocked(chunk); - if (!m) + if (!m) { Printf( "ASAN is about to crash with a CHECK failure.\n" "The ASAN developers are trying to chase down this bug,\n" "so if you've encountered this bug please let us know.\n" "See also: https://github.com/google/sanitizers/issues/1193\n" + "Internal ref b/149237057\n" "chunk: %p caller %p __lsan_current_stage %s\n", chunk, GET_CALLER_PC(), __lsan_current_stage); + GetUserBeginDebug(chunk); + } CHECK(m); return m->Beg(); } diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 105989c93ab10..678f6c1183e01 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -18,15 +18,16 @@ // prefixed __dfsan_. 
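
// [Sketch, not part of the patch] In fast16 mode every label is a bitmask
// over 16 base labels, so the union of two label sets is a plain bitwise OR
// and no union table or label allocation is needed. That is the property the
// new __dfsan_union_load_fast16labels in dfsan.cpp below relies on; a
// hypothetical standalone restatement:
#include <cstddef>
#include <cstdint>
typedef uint16_t label_t; // stand-in for dfsan_label
static label_t union_load_fast16(const label_t *ls, size_t n) {
  label_t label = ls[0];
  for (size_t i = 1; i != n; ++i)
    label |= ls[i]; // union is just OR over the bitmask
  return label;
}
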
//===----------------------------------------------------------------------===// +#include "dfsan/dfsan.h" + #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_file.h" -#include "sanitizer_common/sanitizer_flags.h" #include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_libc.h" -#include "dfsan/dfsan.h" - using namespace __dfsan; typedef atomic_uint16_t atomic_dfsan_label; @@ -158,18 +159,10 @@ static void dfsan_check_label(dfsan_label label) { } } -static void ReportUnsupportedFast16(const char *func) { - Report("FATAL: DataFlowSanitizer: %s is unsupported in fast16labels mode\n", - func); - Die(); -} - // Resolves the union of two unequal labels. Nonequality is a precondition for // this function (the instrumentation pass inlines the equality test). extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label __dfsan_union(dfsan_label l1, dfsan_label l2) { - if (flags().fast16labels) - return l1 | l2; DCHECK_NE(l1, l2); if (l1 == 0) @@ -224,6 +217,14 @@ dfsan_label __dfsan_union_load(const dfsan_label *ls, uptr n) { return label; } +extern "C" SANITIZER_INTERFACE_ATTRIBUTE +dfsan_label __dfsan_union_load_fast16labels(const dfsan_label *ls, uptr n) { + dfsan_label label = ls[0]; + for (uptr i = 1; i != n; ++i) + label |= ls[i]; + return label; +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __dfsan_unimplemented(char *fname) { if (flags().warn_unimplemented) @@ -259,8 +260,6 @@ dfsan_union(dfsan_label l1, dfsan_label l2) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_create_label(const char *desc, void *userdata) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_create_label"); dfsan_label label = atomic_fetch_add(&__dfsan_last_label, 1, memory_order_relaxed) + 1; dfsan_check_label(label); @@ -319,15 +318,11 @@ dfsan_read_label(const void *addr, uptr size) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_get_label_info"); return &__dfsan_label_info[label]; } extern "C" SANITIZER_INTERFACE_ATTRIBUTE int dfsan_has_label(dfsan_label label, dfsan_label elem) { - if (flags().fast16labels) - return label & elem; if (label == elem) return true; const dfsan_label_info *info = dfsan_get_label_info(label); @@ -340,8 +335,6 @@ dfsan_has_label(dfsan_label label, dfsan_label elem) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_has_label_with_desc"); const dfsan_label_info *info = dfsan_get_label_info(label); if (info->l1 != 0) { return dfsan_has_label_with_desc(info->l1, desc) || @@ -361,9 +354,6 @@ dfsan_get_label_count(void) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE void dfsan_dump_labels(int fd) { - if (flags().fast16labels) - return; - dfsan_label last_label = atomic_load(&__dfsan_last_label, memory_order_relaxed); for (uptr l = 1; l <= last_label; ++l) { diff --git a/compiler-rt/lib/dfsan/dfsan_flags.inc b/compiler-rt/lib/dfsan/dfsan_flags.inc index 29db73b98278a..cdd0035c9b2d0 100644 --- a/compiler-rt/lib/dfsan/dfsan_flags.inc +++ b/compiler-rt/lib/dfsan/dfsan_flags.inc @@ -29,7 +29,3 @@ DFSAN_FLAG( DFSAN_FLAG(const char *, dump_labels_at_exit, "", "The path of the file where " "to dump the labels when 
the " "program terminates.") -DFSAN_FLAG(bool, fast16labels, false, - "Enables experimental mode where DFSan supports only 16 power-of-2 labels " - "(1, 2, 4, 8, ... 32768) and the label union is computed as a bit-wise OR." -) diff --git a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp index 48df8e668604c..0e9cdf7e66b15 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp @@ -253,7 +253,7 @@ int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath, return 1; } - static char DFSanEnv[] = "DFSAN_OPTIONS=fast16labels=1:warn_unimplemented=0"; + static char DFSanEnv[] = "DFSAN_OPTIONS=warn_unimplemented=0"; putenv(DFSanEnv); MkDir(DirPath); for (auto &F : CorporaFiles) { diff --git a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp index 00a33a413d2f3..8339697396c21 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDriver.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerDriver.cpp @@ -858,6 +858,12 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { exit(0); // Don't let F destroy itself. } +extern "C" ATTRIBUTE_INTERFACE int +LLVMFuzzerRunDriver(int *argc, char ***argv, + int (*UserCb)(const uint8_t *Data, size_t Size)) { + return FuzzerDriver(argc, argv, UserCb); +} + // Storage for global ExternalFunctions object. ExternalFunctions *EF = nullptr; diff --git a/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp b/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp index 8bf4e25b8b15b..78b3f9abd5cf5 100644 --- a/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp +++ b/compiler-rt/lib/fuzzer/dataflow/DataFlow.cpp @@ -17,9 +17,11 @@ // and also provides basic-block coverage for every input. // // Build: -// 1. Compile this file (DataFlow.cpp) with -fsanitize=dataflow and -O2. +// 1. Compile this file (DataFlow.cpp) with -fsanitize=dataflow -mllvm +// -dfsan-fast-16-labels and -O2. // 2. Compile DataFlowCallbacks.cpp with -O2 -fPIC. // 3. Build the fuzz target with -g -fsanitize=dataflow +// -mllvm -dfsan-fast-16-labels // -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp // 4. Link those together with -fsanitize=dataflow // @@ -36,7 +38,7 @@ // Run: // # Collect data flow and coverage for INPUT_FILE // # write to OUTPUT_FILE (default: stdout) -// export DFSAN_OPTIONS=fast16labels=1:warn_unimplemented=0 +// export DFSAN_OPTIONS=warn_unimplemented=0 // ./a.out INPUT_FILE [OUTPUT_FILE] // // # Print all instrumented functions. 
llvm-symbolizer must be present in PATH diff --git a/compiler-rt/lib/gwp_asan/common.cpp b/compiler-rt/lib/gwp_asan/common.cpp index 3438c4b91893b..483694d57b7e6 100644 --- a/compiler-rt/lib/gwp_asan/common.cpp +++ b/compiler-rt/lib/gwp_asan/common.cpp @@ -34,6 +34,9 @@ const char *ErrorToString(const Error &E) { __builtin_trap(); } +constexpr size_t AllocationMetadata::kStackFrameStorageBytes; +constexpr size_t AllocationMetadata::kMaxTraceLengthToCollect; + void AllocationMetadata::RecordAllocation(uintptr_t AllocAddr, size_t AllocSize) { Addr = AllocAddr; diff --git a/compiler-rt/lib/gwp_asan/crash_handler.cpp b/compiler-rt/lib/gwp_asan/crash_handler.cpp index c3b9e1467bd9f..3c640256706f5 100644 --- a/compiler-rt/lib/gwp_asan/crash_handler.cpp +++ b/compiler-rt/lib/gwp_asan/crash_handler.cpp @@ -10,6 +10,7 @@ #include "gwp_asan/stack_trace_compressor.h" #include +#include using AllocationMetadata = gwp_asan::AllocationMetadata; using Error = gwp_asan::Error; @@ -112,9 +113,15 @@ uint64_t __gwp_asan_get_allocation_thread_id( size_t __gwp_asan_get_allocation_trace( const gwp_asan::AllocationMetadata *AllocationMeta, uintptr_t *Buffer, size_t BufferLen) { - return gwp_asan::compression::unpack( + uintptr_t UncompressedBuffer[AllocationMetadata::kMaxTraceLengthToCollect]; + size_t UnpackedLength = gwp_asan::compression::unpack( AllocationMeta->AllocationTrace.CompressedTrace, - AllocationMeta->AllocationTrace.TraceSize, Buffer, BufferLen); + AllocationMeta->AllocationTrace.TraceSize, UncompressedBuffer, + AllocationMetadata::kMaxTraceLengthToCollect); + if (UnpackedLength < BufferLen) + BufferLen = UnpackedLength; + memcpy(Buffer, UncompressedBuffer, BufferLen * sizeof(*Buffer)); + return UnpackedLength; } bool __gwp_asan_is_deallocated( @@ -130,9 +137,15 @@ uint64_t __gwp_asan_get_deallocation_thread_id( size_t __gwp_asan_get_deallocation_trace( const gwp_asan::AllocationMetadata *AllocationMeta, uintptr_t *Buffer, size_t BufferLen) { - return gwp_asan::compression::unpack( + uintptr_t UncompressedBuffer[AllocationMetadata::kMaxTraceLengthToCollect]; + size_t UnpackedLength = gwp_asan::compression::unpack( AllocationMeta->DeallocationTrace.CompressedTrace, - AllocationMeta->DeallocationTrace.TraceSize, Buffer, BufferLen); + AllocationMeta->DeallocationTrace.TraceSize, UncompressedBuffer, + AllocationMetadata::kMaxTraceLengthToCollect); + if (UnpackedLength < BufferLen) + BufferLen = UnpackedLength; + memcpy(Buffer, UncompressedBuffer, BufferLen * sizeof(*Buffer)); + return UnpackedLength; } #ifdef __cplusplus diff --git a/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp b/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp index 1bd7a606c2136..9a80436efb311 100644 --- a/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp +++ b/compiler-rt/lib/gwp_asan/optional/segv_handler_posix.cpp @@ -144,7 +144,7 @@ void installSignalHandlers(gwp_asan::GuardedPoolAllocator *GPA, Printf_t Printf, PrintBacktraceForSignalHandler = PrintBacktrace; BacktraceForSignalHandler = SegvBacktrace; - struct sigaction Action; + struct sigaction Action = {}; Action.sa_sigaction = sigSegvHandler; Action.sa_flags = SA_SIGINFO; sigaction(SIGSEGV, &Action, &PreviousHandler); diff --git a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp index b3d44270bb2a1..95150653ff61a 100644 --- a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp +++ b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp @@ -8,6 +8,7 @@ #include +#include "gwp_asan/common.h" #include 
"gwp_asan/crash_handler.h" #include "gwp_asan/tests/harness.h" @@ -76,9 +77,46 @@ TEST(Backtrace, Short) { TEST(Backtrace, ExceedsStorableLength) { gwp_asan::AllocationMetadata Meta; Meta.AllocationTrace.RecordBacktrace( - [](uintptr_t * /* TraceBuffer */, size_t /* Size */) -> size_t { - return SIZE_MAX; // Wow, that's big! + [](uintptr_t *TraceBuffer, size_t Size) -> size_t { + // Need to inintialise the elements that will be packed. + memset(TraceBuffer, 0u, Size * sizeof(*TraceBuffer)); + + // Indicate that there were more frames, and we just didn't have enough + // room to store them. + return Size * 2; + }); + // Retrieve a frame from the collected backtrace, make sure it works E2E. + uintptr_t TraceOutput; + EXPECT_EQ(gwp_asan::AllocationMetadata::kMaxTraceLengthToCollect, + __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1)); +} + +TEST(Backtrace, ExceedsRetrievableAllocLength) { + gwp_asan::AllocationMetadata Meta; + constexpr size_t kNumFramesToStore = 3u; + Meta.AllocationTrace.RecordBacktrace( + [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t { + memset(TraceBuffer, kNumFramesToStore, + kNumFramesToStore * sizeof(*TraceBuffer)); + return kNumFramesToStore; + }); + uintptr_t TraceOutput; + // Ask for one element, get told that there's `kNumFramesToStore` available. + EXPECT_EQ(kNumFramesToStore, + __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1)); +} + +TEST(Backtrace, ExceedsRetrievableDeallocLength) { + gwp_asan::AllocationMetadata Meta; + constexpr size_t kNumFramesToStore = 3u; + Meta.DeallocationTrace.RecordBacktrace( + [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t { + memset(TraceBuffer, kNumFramesToStore, + kNumFramesToStore * sizeof(*TraceBuffer)); + return kNumFramesToStore; }); uintptr_t TraceOutput; - EXPECT_EQ(1u, __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1)); + // Ask for one element, get told that there's `kNumFramesToStore` available. + EXPECT_EQ(kNumFramesToStore, + __gwp_asan_get_deallocation_trace(&Meta, &TraceOutput, 1)); } diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index d61e9dee30654..bfdae0b920c9b 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -142,7 +142,7 @@ bool InitShadow(bool init_origins) { if (map) { if (!CheckMemoryRangeAvailability(start, size)) return false; - if (!MmapFixedNoReserve(start, size, kMemoryLayout[i].name)) + if (!MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name)) return false; if (common_flags()->use_madv_dontdump) DontDumpShadowMemory(start, size); diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 5ff0e10182b4d..3457bb6444de6 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -1,11 +1,11 @@ CHECK_CXX_SOURCE_COMPILES(" -#ifdef _MSC_VER -#include /* Workaround for PR19898. */ +#ifdef _WIN32 +#include /* Workaround for PR19898. */ #include #endif int main() { -#ifdef _MSC_VER +#ifdef _WIN32 volatile LONG val = 1; MemoryBarrier(); InterlockedCompareExchange(&val, 0, 1); @@ -113,6 +113,8 @@ endif() # We don't use the C++ Standard Library here, so avoid including it by mistake. append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS) +# Remove -stdlib= which is unused when passing -nostdinc++. +string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) # This appears to be a C-only warning banning the use of locals in aggregate # initializers. 
All other compilers accept this, though. diff --git a/compiler-rt/lib/profile/InstrProfiling.c b/compiler-rt/lib/profile/InstrProfiling.c index 92ad25f62cd1b..6df65f66df732 100644 --- a/compiler-rt/lib/profile/InstrProfiling.c +++ b/compiler-rt/lib/profile/InstrProfiling.c @@ -6,6 +6,9 @@ |* \*===----------------------------------------------------------------------===*/ +// Note: This is linked into the Darwin kernel, and must remain compatible +// with freestanding compilation. See `darwin_add_builtin_libraries`. + #include #include #include @@ -17,7 +20,6 @@ #define INSTR_PROF_VALUE_PROF_DATA #include "profile/InstrProfData.inc" - COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_magic(void) { return sizeof(void *) == sizeof(uint64_t) ? (INSTR_PROF_RAW_MAGIC_64) : (INSTR_PROF_RAW_MAGIC_32); diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index d7a7c32332c1b..7d1c77a3fab3e 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -54,6 +54,15 @@ int __llvm_profile_is_continuous_mode_enabled(void); */ void __llvm_profile_enable_continuous_mode(void); +/*! + * \brief Set the page size. + * + * This is a pre-requisite for enabling continuous mode. The buffer size + * calculation code inside of libprofile cannot simply call getpagesize(), as + * it is not allowed to depend on libc. + */ +void __llvm_profile_set_page_size(unsigned PageSize); + /*! * \brief Get number of bytes necessary to pad the argument to eight * byte boundary. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 5ee44785a7ab9..07bb4d4e4f1b4 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -6,6 +6,9 @@ |* \*===----------------------------------------------------------------------===*/ +// Note: This is linked into the Darwin kernel, and must remain compatible +// with freestanding compilation. See `darwin_add_builtin_libraries`. + #include "InstrProfiling.h" #include "InstrProfilingInternal.h" #include "InstrProfilingPort.h" @@ -18,14 +21,22 @@ * layering is violated. */ static int ContinuouslySyncProfile = 0; +/* The system page size. Only valid when non-zero. If 0, the page size is + * unavailable. */ +static unsigned PageSize = 0; + COMPILER_RT_VISIBILITY int __llvm_profile_is_continuous_mode_enabled(void) { - return ContinuouslySyncProfile; + return ContinuouslySyncProfile && PageSize; } COMPILER_RT_VISIBILITY void __llvm_profile_enable_continuous_mode(void) { ContinuouslySyncProfile = 1; } +COMPILER_RT_VISIBILITY void __llvm_profile_set_page_size(unsigned PS) { + PageSize = PS; +} + COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer(void) { const __llvm_profile_data *DataBegin = __llvm_profile_begin_data(); @@ -49,8 +60,7 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, /// Calculate the number of padding bytes needed to add to \p Offset in order /// for (\p Offset + Padding) to be page-aligned. -static uint64_t calculateBytesNeededToPageAlign(uint64_t Offset, - unsigned PageSize) { +static uint64_t calculateBytesNeededToPageAlign(uint64_t Offset) { uint64_t OffsetModPage = Offset % PageSize; if (OffsetModPage > 0) return PageSize - OffsetModPage; @@ -72,15 +82,13 @@ void __llvm_profile_get_padding_sizes_for_counters( // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. 
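
/* [Sketch, not part of the patch] The padding rule implemented by
 * calculateBytesNeededToPageAlign below: round Offset up to the next page
 * boundary, adding nothing when it is already aligned. 4096 is an assumed
 * page size for illustration only. */
#include <assert.h>
#include <stdint.h>
static uint64_t bytesToPageAlign(uint64_t Offset, uint64_t PageSize) {
  uint64_t Rem = Offset % PageSize;
  return Rem ? PageSize - Rem : 0;
}
int main(void) {
  assert(bytesToPageAlign(1, 4096) == 4095);
  assert(bytesToPageAlign(4096, 4096) == 0);
  return 0;
}
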
- unsigned PageSize = getpagesize(); uint64_t DataSizeInBytes = DataSize * sizeof(__llvm_profile_data); uint64_t CountersSizeInBytes = CountersSize * sizeof(uint64_t); *PaddingBytesBeforeCounters = calculateBytesNeededToPageAlign( - sizeof(__llvm_profile_header) + DataSizeInBytes, PageSize); + sizeof(__llvm_profile_header) + DataSizeInBytes); *PaddingBytesAfterCounters = - calculateBytesNeededToPageAlign(CountersSizeInBytes, PageSize); - *PaddingBytesAfterNames = - calculateBytesNeededToPageAlign(NamesSize, PageSize); + calculateBytesNeededToPageAlign(CountersSizeInBytes); + *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); } COMPILER_RT_VISIBILITY diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 9e1a54a0c3737..8c7bb0c25de16 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -751,6 +751,7 @@ static int parseFilenamePattern(const char *FilenamePat, return -1; } + __llvm_profile_set_page_size(getpagesize()); __llvm_profile_enable_continuous_mode(); I++; /* advance to 'c' */ } else { diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.c b/compiler-rt/lib/profile/InstrProfilingInternal.c index d58bc19ad11e6..6a54697df7f01 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.c +++ b/compiler-rt/lib/profile/InstrProfilingInternal.c @@ -6,6 +6,9 @@ |* \*===----------------------------------------------------------------------===*/ +// Note: This is linked into the Darwin kernel, and must remain compatible +// with freestanding compilation. See `darwin_add_builtin_libraries`. + #if !defined(__Fuchsia__) #include "InstrProfilingInternal.h" diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c index 23bdb7f37179d..29541c74d5a63 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c @@ -6,6 +6,9 @@ |* \*===----------------------------------------------------------------------===*/ +// Note: This is linked into the Darwin kernel, and must remain compatible +// with freestanding compilation. See `darwin_add_builtin_libraries`. 
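
// [Usage sketch, not part of the patch] A hosted embedder is expected to
// inject the page size before turning continuous mode on, exactly as
// InstrProfilingFile.c now does; the runtime cannot call getpagesize()
// itself because it must stay libc-free for the freestanding builds noted
// above.
#include <unistd.h>
extern "C" void __llvm_profile_set_page_size(unsigned PageSize);
extern "C" void __llvm_profile_enable_continuous_mode(void);
static void enableContinuousMode(void) {
  __llvm_profile_set_page_size(getpagesize()); // must happen first
  __llvm_profile_enable_continuous_mode();
}
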
+ #include "InstrProfiling.h" #if defined(__APPLE__) diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index becfe1fd9f5ad..c9fb481f8e90c 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -43,7 +43,7 @@ uint64_t __prof_cnts_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_CNTS_SECT_NAME); uint32_t __prof_orderfile_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_ORDERFILE_SECT_NAME); -char __prof_nms_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_NAME_SECT_NAME); +const char __prof_nms_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_NAME_SECT_NAME); ValueProfNode __prof_vnodes_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_VNODES_SECT_NAME); COMPILER_RT_VISIBILITY const __llvm_profile_data * diff --git a/compiler-rt/lib/profile/InstrProfilingPort.h b/compiler-rt/lib/profile/InstrProfilingPort.h index 20cf5d660c6ae..4493dd512ff0d 100644 --- a/compiler-rt/lib/profile/InstrProfilingPort.h +++ b/compiler-rt/lib/profile/InstrProfilingPort.h @@ -53,9 +53,9 @@ #endif #if COMPILER_RT_HAS_ATOMICS == 1 -#ifdef _MSC_VER +#ifdef _WIN32 #include -#if _MSC_VER < 1900 +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf #endif #if defined(_WIN64) @@ -73,7 +73,7 @@ (DomType *)InterlockedExchangeAdd((LONG volatile *)&PtrVar, \ (LONG)sizeof(DomType) * PtrIncr) #endif -#else /* !defined(_MSC_VER) */ +#else /* !defined(_WIN32) */ #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV) \ __sync_bool_compare_and_swap(Ptr, OldV, NewV) #define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr) \ diff --git a/compiler-rt/lib/profile/InstrProfilingValue.c b/compiler-rt/lib/profile/InstrProfilingValue.c index 29b9e628a9c95..f6b28b48e8426 100644 --- a/compiler-rt/lib/profile/InstrProfilingValue.c +++ b/compiler-rt/lib/profile/InstrProfilingValue.c @@ -6,6 +6,7 @@ |* \*===----------------------------------------------------------------------===*/ +#include #include #include #include @@ -17,13 +18,14 @@ #define INSTR_PROF_VALUE_PROF_DATA #define INSTR_PROF_COMMON_API_IMPL +#define INSTR_PROF_VALUE_PROF_MEMOP_API #include "profile/InstrProfData.inc" static int hasStaticCounters = 1; static int OutOfNodesWarnings = 0; static int hasNonDefaultValsPerSite = 0; #define INSTR_PROF_MAX_VP_WARNS 10 -#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 16 +#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24 #define INSTR_PROF_VNODE_POOL_SIZE 1024 #ifndef _MSC_VER @@ -93,8 +95,8 @@ static int allocateValueProfileCounters(__llvm_profile_data *Data) { for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI) NumVSites += Data->NumValueSites[VKI]; - if (NumVSites == 0) - return 0; + // If NumVSites = 0, calloc is allowed to return a non-null pointer. + assert(NumVSites > 0 && "NumVSites can't be zero"); ValueProfNode **Mem = (ValueProfNode **)calloc(NumVSites, sizeof(ValueProfNode *)); if (!Mem) @@ -252,6 +254,8 @@ __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data, * The range for large values is optional. The default value of INT64_MIN * indicates it is not specified. */ +/* FIXME: This is to be removed after switching to the new memop value + * profiling. 
*/ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_range( uint64_t TargetValue, void *Data, uint32_t CounterIndex, int64_t PreciseRangeStart, int64_t PreciseRangeLast, int64_t LargeValue) { @@ -265,6 +269,18 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_range( __llvm_profile_instrument_target(TargetValue, Data, CounterIndex); } +/* + * The target values are partitioned into multiple ranges. The range spec is + * defined in InstrProfData.inc. + */ +COMPILER_RT_VISIBILITY void +__llvm_profile_instrument_memop(uint64_t TargetValue, void *Data, + uint32_t CounterIndex) { + // Map the target value to the representative value of its range. + uint64_t RepValue = InstrProfGetRangeRepValue(TargetValue); + __llvm_profile_instrument_target(RepValue, Data, CounterIndex); +} + /* * A wrapper struct that represents value profile runtime data. * Like InstrProfRecord class which is used by profiling host tools, diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index c34e110a69590..16ad965ff6083 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -6,6 +6,9 @@ |* \*===----------------------------------------------------------------------===*/ +// Note: This is linked into the Darwin kernel, and must remain compatible +// with freestanding compilation. See `darwin_add_builtin_libraries`. + #ifdef _MSC_VER /* For _alloca */ #include diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h index 33f89d6d49928..6d73784d77d09 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_combined.h @@ -142,6 +142,13 @@ class CombinedAllocator { return secondary_.GetBlockBeginFastLocked(p); } + void *GetBlockBeginFastLockedDebug(void *p) { + if (primary_.PointerIsMine(p)) + return primary_.GetBlockBeginDebug(p); + return secondary_.GetBlockBeginFastLocked(p); + } + + uptr GetActuallyAllocatedSize(void *p) { if (primary_.PointerIsMine(p)) return primary_.GetActuallyAllocatedSize(p); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h index 3b1838b3985ac..47cc42cb411c2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h @@ -210,6 +210,7 @@ class SizeClassAllocator32 { uptr res = beg + (n * (u32)size); return reinterpret_cast(res); } + void *GetBlockBeginDebug(const void *p) { return GetBlockBegin(p); } uptr GetActuallyAllocatedSize(void *p) { CHECK(PointerIsMine(p)); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h index 1d9a29c70f308..acc61cc6ba8dd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary64.h @@ -199,6 +199,29 @@ class SizeClassAllocator64 { return nullptr; } + void *GetBlockBeginDebug(const void *p) { + uptr class_id = GetSizeClass(p); + uptr size = ClassIdToSize(class_id); + Printf("GetBlockBeginDebug1 p %p class_id %p size %p\n", p, class_id, size); + if (!size) return nullptr; + uptr chunk_idx = GetChunkIdx((uptr)p, size); + uptr reg_beg = GetRegionBegin(p); + uptr beg = chunk_idx * size; + uptr next_beg = beg + size; + Printf( + 
"GetBlockBeginDebug2 chunk_idx %p reg_beg %p beg %p next_beg %p " + "kNumClasses %p\n", + chunk_idx, reg_beg, beg, next_beg, kNumClasses); + if (class_id >= kNumClasses) return nullptr; + const RegionInfo *region = AddressSpaceView::Load(GetRegionInfo(class_id)); + Printf("GetBlockBeginDebug3 region %p region->mapped_user %p\n", region, + region->mapped_user); + if (region->mapped_user >= next_beg) + return reinterpret_cast(reg_beg + beg); + return nullptr; + } + + uptr GetActuallyAllocatedSize(void *p) { CHECK(PointerIsMine(p)); return ClassIdToSize(GetSizeClass(p)); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h index 86ad3a5e2c2aa..38439e44f611e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h @@ -21,8 +21,9 @@ #if SANITIZER_LINUX || SANITIZER_FUCHSIA -# if __GLIBC_PREREQ(2, 16) || (SANITIZER_ANDROID && __ANDROID_API__ >= 21) || \ - SANITIZER_FUCHSIA +# if (__GLIBC_PREREQ(2, 16) || (SANITIZER_ANDROID && __ANDROID_API__ >= 21) || \ + SANITIZER_FUCHSIA) && \ + !SANITIZER_GO # define SANITIZER_USE_GETAUXVAL 1 # else # define SANITIZER_USE_GETAUXVAL 0 diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index db8a09e6f0de3..21a9c01bf2a9a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -606,21 +606,84 @@ HandleSignalMode GetHandleSignalMode(int signum) { return result; } -// This corresponds to Triple::getMacOSXVersion() in the Clang driver. -static MacosVersion GetMacosAlignedVersionInternal() { - u16 kernel_major = GetDarwinKernelVersion().major; - // Darwin 0-3 -> unsupported - // Darwin 4-19 -> macOS 10.x - // Darwin 20+ -> macOS 11+ - CHECK_GE(kernel_major, 4); - u16 major, minor; - if (kernel_major < 20) { - major = 10; - minor = kernel_major - 4; +// Offset example: +// XNU 17 -- macOS 10.13 -- iOS 11 -- tvOS 11 -- watchOS 4 +constexpr u16 GetOSMajorKernelOffset() { + if (TARGET_OS_OSX) return 4; + if (TARGET_OS_IOS || TARGET_OS_TV) return 6; + if (TARGET_OS_WATCH) return 13; +} + +using VersStr = char[64]; + +static void GetOSVersion(VersStr vers) { + uptr len = sizeof(VersStr); + if (SANITIZER_IOSSIM) { + const char *vers_env = GetEnv("SIMULATOR_RUNTIME_VERSION"); + if (!vers_env) { + Report("ERROR: Running in simulator but SIMULATOR_RUNTIME_VERSION env " + "var is not set.\n"); + Die(); + } + len = internal_strlcpy(vers, vers_env, len); } else { - major = 11 + kernel_major - 20; - minor = 0; + int res = + internal_sysctlbyname("kern.osproductversion", vers, &len, nullptr, 0); + if (res) { + // Fallback for XNU 17 (macOS 10.13) and below that do not provide the + // `kern.osproductversion` property. + u16 kernel_major = GetDarwinKernelVersion().major; + u16 offset = GetOSMajorKernelOffset(); + CHECK_LE(kernel_major, 17); + CHECK_GE(kernel_major, offset); + u16 os_major = kernel_major - offset; + + auto format = TARGET_OS_OSX ? 
"10.%d" : "%d.0"; + len = internal_snprintf(vers, len, format, os_major); + } + } + CHECK_LT(len, sizeof(VersStr)); +} + +void ParseVersion(const char *vers, u16 *major, u16 *minor) { + // Format: .[.]\0 + CHECK_GE(internal_strlen(vers), 3); + const char *p = vers; + *major = internal_simple_strtoll(p, &p, /*base=*/10); + CHECK_EQ(*p, '.'); + p += 1; + *minor = internal_simple_strtoll(p, &p, /*base=*/10); +} + +// Aligned versions example: +// macOS 10.15 -- iOS 13 -- tvOS 13 -- watchOS 6 +static void MapToMacos(u16 *major, u16 *minor) { + if (TARGET_OS_OSX) + return; + + if (TARGET_OS_IOS || TARGET_OS_TV) + *major += 2; + else if (TARGET_OS_WATCH) + *major += 9; + else + UNREACHABLE("unsupported platform"); + + if (*major >= 16) { // macOS 11+ + *major -= 5; + } else { // macOS 10.15 and below + *minor = *major; + *major = 10; } +} + +static MacosVersion GetMacosAlignedVersionInternal() { + VersStr vers; + GetOSVersion(vers); + + u16 major, minor; + ParseVersion(vers, &major, &minor); + MapToMacos(&major, &minor); + return MacosVersion(major, minor); } @@ -639,24 +702,15 @@ MacosVersion GetMacosAlignedVersion() { return *reinterpret_cast(&result); } -void ParseVersion(const char *vers, u16 *major, u16 *minor) { - // Format: ..\0 - CHECK_GE(internal_strlen(vers), 5); - const char *p = vers; - *major = internal_simple_strtoll(p, &p, /*base=*/10); - CHECK_EQ(*p, '.'); - p += 1; - *minor = internal_simple_strtoll(p, &p, /*base=*/10); -} - DarwinKernelVersion GetDarwinKernelVersion() { - char buf[100]; - size_t len = sizeof(buf); - int res = internal_sysctlbyname("kern.osrelease", buf, &len, nullptr, 0); + VersStr vers; + uptr len = sizeof(VersStr); + int res = internal_sysctlbyname("kern.osrelease", vers, &len, nullptr, 0); CHECK_EQ(res, 0); + CHECK_LT(len, sizeof(VersStr)); u16 major, minor; - ParseVersion(buf, &major, &minor); + ParseVersion(vers, &major, &minor); return DarwinKernelVersion(major, minor); } @@ -796,6 +850,19 @@ void SignalContext::InitPcSpBp() { GetPcSpBp(context, &pc, &sp, &bp); } +// ASan/TSan use mmap in a way that creates “deallocation gaps” which triggers +// EXC_GUARD exceptions on macOS 10.15+ (XNU 19.0+). +static void DisableMmapExcGuardExceptions() { + using task_exc_guard_behavior_t = uint32_t; + using task_set_exc_guard_behavior_t = + kern_return_t(task_t task, task_exc_guard_behavior_t behavior); + auto *set_behavior = (task_set_exc_guard_behavior_t *)dlsym( + RTLD_DEFAULT, "task_set_exc_guard_behavior"); + if (set_behavior == nullptr) return; + const task_exc_guard_behavior_t task_exc_guard_none = 0; + set_behavior(mach_task_self(), task_exc_guard_none); +} + void InitializePlatformEarly() { // Only use xnu_fast_mmap when on x86_64 and the kernel supports it. use_xnu_fast_mmap = @@ -804,6 +871,8 @@ void InitializePlatformEarly() { #else false; #endif + if (GetDarwinKernelVersion() >= DarwinKernelVersion(19, 0)) + DisableMmapExcGuardExceptions(); } #if !SANITIZER_GO @@ -844,20 +913,10 @@ bool ReexecDisabled() { return false; } -extern "C" SANITIZER_WEAK_ATTRIBUTE double dyldVersionNumber; -static const double kMinDyldVersionWithAutoInterposition = 360.0; - -bool DyldNeedsEnvVariable() { - // Although sanitizer support was added to LLVM on OS X 10.7+, GCC users - // still may want use them on older systems. On older Darwin platforms, dyld - // doesn't export dyldVersionNumber symbol and we simply return true. 
- if (!&dyldVersionNumber) return true; +static bool DyldNeedsEnvVariable() { // If running on OS X 10.11+ or iOS 9.0+, dyld will interpose even if - // DYLD_INSERT_LIBRARIES is not set. However, checking OS version via - // GetMacosAlignedVersion() doesn't work for the simulator. Let's instead - // check `dyldVersionNumber`, which is exported by dyld, against a known - // version number from the first OS release where this appeared. - return dyldVersionNumber < kMinDyldVersionWithAutoInterposition; + // DYLD_INSERT_LIBRARIES is not set. + return GetMacosAlignedVersion() < MacosVersion(10, 11); } void MaybeReexec() { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h index 90ecff4815c2e..f61ebe2566e5f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.h @@ -44,6 +44,7 @@ struct VersionBase { return major > other.major || (major == other.major && minor >= other.minor); } + bool operator<(const VersionType &other) const { return !(*this >= other); } }; struct MacosVersion : VersionBase { diff --git a/compiler-rt/lib/sanitizer_common/scripts/litlint.py b/compiler-rt/lib/sanitizer_common/scripts/litlint.py index 81b89c2144381..c443efb915a9c 100755 --- a/compiler-rt/lib/sanitizer_common/scripts/litlint.py +++ b/compiler-rt/lib/sanitizer_common/scripts/litlint.py @@ -45,7 +45,7 @@ def LintFile(p): The number of errors detected. """ errs = 0 - with open(p, 'r') as f: + with open(p, 'r', encoding='utf-8') as f: for i, s in enumerate(f.readlines(), start=1): msg, col = LintLine(s) if msg != None: diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp index c8658ea55d034..090947eceb4a7 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_mac_test.cpp @@ -22,6 +22,38 @@ namespace __sanitizer { +void ParseVersion(const char *vers, u16 *major, u16 *minor); + +TEST(SanitizerMac, ParseVersion) { + u16 major, minor; + + ParseVersion("11.22.33", &major, &minor); + EXPECT_EQ(major, 11); + EXPECT_EQ(minor, 22); + + ParseVersion("1.2", &major, &minor); + EXPECT_EQ(major, 1); + EXPECT_EQ(minor, 2); +} + +// TODO(yln): Run sanitizer unit tests for the simulators (rdar://65680742) +#if SANITIZER_IOSSIM +TEST(SanitizerMac, GetMacosAlignedVersion) { + const char *vers_str; + if (SANITIZER_IOS || SANITIZER_TVOS) { + vers_str = "13.0"; + } else if (SANITIZER_WATCHOS) { + vers_str = "6.5"; + } else { + FAIL() << "unsupported simulator runtime"; + } + setenv("SIMULATOR_RUNTIME_VERSION", vers_str, /*overwrite=*/1); + + MacosVersion vers = GetMacosAlignedVersion(); + EXPECT_EQ(vers.major, 10); + EXPECT_EQ(vers.minor, 15); +} +#else TEST(SanitizerMac, GetMacosAlignedVersion) { MacosVersion vers = GetMacosAlignedVersion(); u16 kernel_major = GetDarwinKernelVersion().major; @@ -31,15 +63,7 @@ TEST(SanitizerMac, GetMacosAlignedVersion) { EXPECT_EQ(vers.major, expected_major); EXPECT_EQ(vers.minor, expected_minor); } - -void ParseVersion(const char *vers, u16 *major, u16 *minor); - -TEST(SanitizerMac, ParseVersion) { - u16 major, minor; - ParseVersion("11.22.33", &major, &minor); - EXPECT_EQ(major, 11); - EXPECT_EQ(minor, 22); -} +#endif TEST(SanitizerMac, GetDarwinKernelVersion) { DarwinKernelVersion vers = GetDarwinKernelVersion(); diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt 
index bdaeb569efdd1..dfae6dde5e4d0 100644 --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -19,6 +19,9 @@ append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding SCUDO_CFLAGS) append_list_if(COMPILER_RT_HAS_FVISIBILITY_HIDDEN_FLAG -fvisibility=hidden SCUDO_CFLAGS) +# FIXME: find cleaner way to agree with GWPAsan flags +append_list_if(COMPILER_RT_HAS_FNO_LTO_FLAG -fno-lto SCUDO_CFLAGS) + if (COMPILER_RT_HAS_GWP_ASAN) append_list_if(COMPILER_RT_HAS_OMIT_FRAME_POINTER_FLAG -fno-omit-frame-pointer SCUDO_CFLAGS) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index ad2a17ef7014a..cf362da4e5be5 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -48,9 +48,10 @@ struct AndroidConfig { typedef SizeClassAllocator32 Primary; #endif // Cache blocks up to 2MB - typedef MapAllocator> Secondary; + typedef MapAllocator> + Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. }; struct AndroidSvelteConfig { @@ -62,9 +63,9 @@ struct AndroidSvelteConfig { // 64KB regions typedef SizeClassAllocator32 Primary; #endif - typedef MapAllocator> Secondary; + typedef MapAllocator> Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, only 1 TSD. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 2 TSDs. }; #if SCUDO_CAN_USE_PRIMARY64 @@ -73,7 +74,7 @@ struct FuchsiaConfig { typedef SizeClassAllocator64 Primary; typedef MapAllocator Secondary; template - using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. + using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. }; #endif diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 582178ee36b40..6ca00c29ab732 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -41,8 +41,6 @@ extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf, namespace scudo { -enum class Option { ReleaseInterval, MemtagTuning }; - template class Allocator { public: @@ -277,7 +275,7 @@ class Allocator { } #endif // GWP_ASAN_HOOKS - FillContentsMode FillContents = + const FillContentsMode FillContents = ZeroContents ? ZeroFill : Options.FillContents; if (UNLIKELY(Alignment > MaxAlignment)) { @@ -285,7 +283,7 @@ class Allocator { return nullptr; reportAlignmentTooBig(Alignment, MaxAlignment); } - if (Alignment < MinAlignment) + if (UNLIKELY(Alignment < MinAlignment)) Alignment = MinAlignment; // If the requested size happens to be 0 (more common than you might think), @@ -322,13 +320,11 @@ class Allocator { if (UNLIKELY(!Block)) { while (ClassId < SizeClassMap::LargestClassId) { Block = TSD->Cache.allocate(++ClassId); - if (LIKELY(Block)) { + if (LIKELY(Block)) break; - } } - if (UNLIKELY(!Block)) { + if (UNLIKELY(!Block)) ClassId = 0; - } } if (UnlockRequired) TSD->unlock(); @@ -349,7 +345,7 @@ class Allocator { void *Ptr = reinterpret_cast(UserPtr); void *TaggedPtr = Ptr; - if (ClassId) { + if (LIKELY(ClassId)) { // We only need to zero or tag the contents for Primary backed // allocations. 
We only set tags for primary allocations in order to avoid // faulting potentially large numbers of pages for large secondary @@ -692,11 +688,7 @@ class Allocator { } bool setOption(Option O, sptr Value) { - if (O == Option::ReleaseInterval) { - Primary.setReleaseToOsIntervalMs(static_cast(Value)); - Secondary.setReleaseToOsIntervalMs(static_cast(Value)); - return true; - } + initThreadMaybe(); if (O == Option::MemtagTuning) { // Enabling odd/even tags involves a tradeoff between use-after-free // detection and buffer overflow detection. Odd/even tags make it more @@ -705,14 +697,19 @@ class Allocator { // use-after-free is less likely to be detected because the tag space for // any particular chunk is cut in half. Therefore we use this tuning // setting to control whether odd/even tags are enabled. - if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW) { + if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW) Options.UseOddEvenTags = true; - return true; - } - if (Value == M_MEMTAG_TUNING_UAF) { + else if (Value == M_MEMTAG_TUNING_UAF) Options.UseOddEvenTags = false; - return true; - } + return true; + } else { + // We leave it to the various sub-components to decide whether or not they + // want to handle the option, but we do not want to short-circuit + // execution if one of the setOption was to return false. + const bool PrimaryResult = Primary.setOption(O, Value); + const bool SecondaryResult = Secondary.setOption(O, Value); + const bool RegistryResult = TSDRegistry.setOption(O, Value); + return PrimaryResult && SecondaryResult && RegistryResult; } return false; } @@ -805,8 +802,7 @@ class Allocator { PrimaryT::findNearestBlock(RegionInfoPtr, UntaggedFaultAddr); auto GetGranule = [&](uptr Addr, const char **Data, uint8_t *Tag) -> bool { - if (Addr < MemoryAddr || - Addr + archMemoryTagGranuleSize() < Addr || + if (Addr < MemoryAddr || Addr + archMemoryTagGranuleSize() < Addr || Addr + archMemoryTagGranuleSize() > MemoryAddr + MemorySize) return false; *Data = &Memory[Addr - MemoryAddr]; @@ -950,10 +946,10 @@ class Allocator { u32 Cookie; struct { - u8 MayReturnNull : 1; // may_return_null + u8 MayReturnNull : 1; // may_return_null FillContentsMode FillContents : 2; // zero_contents, pattern_fill_contents - u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch - u8 DeleteSizeMismatch : 1; // delete_size_mismatch + u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch + u8 DeleteSizeMismatch : 1; // delete_size_mismatch u8 TrackAllocationStacks : 1; u8 UseOddEvenTags : 1; u32 QuarantineMaxChunkSize; // quarantine_max_chunk_size diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index 9037f92b4976c..b3bce6ee291aa 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -182,6 +182,14 @@ struct BlockInfo { uptr RegionEnd; }; +enum class Option : u8 { + ReleaseInterval, // Release to OS interval in milliseconds. + MemtagTuning, // Whether to tune tagging for UAF or overflow. + MaxCacheEntriesCount, // Maximum number of blocks that can be cached. + MaxCacheEntrySize, // Maximum size of a block that can be cached. + MaxTSDsCount, // Number of usable TSDs for the shared registry. 
+}; + constexpr unsigned char PatternFillByte = 0xAB; enum FillContentsMode { diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h index 27c1684596c44..7e65b68ab36d0 100644 --- a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h +++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h @@ -121,6 +121,18 @@ size_t __scudo_get_region_info_size(); #define M_MEMTAG_TUNING -102 #endif +#ifndef M_CACHE_COUNT_MAX +#define M_CACHE_COUNT_MAX -200 +#endif + +#ifndef M_CACHE_SIZE_MAX +#define M_CACHE_SIZE_MAX -201 +#endif + +#ifndef M_TSDS_COUNT_MAX +#define M_TSDS_COUNT_MAX -202 +#endif + enum scudo_memtag_tuning { // Tune for buffer overflows. M_MEMTAG_TUNING_BUFFER_OVERFLOW, diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 2ee0f6c600ab2..e41b949d8d0e6 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -86,7 +86,7 @@ class SizeClassAllocator32 { if (Sci->CanRelease) Sci->ReleaseInfo.LastReleaseAtNs = Time; } - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -184,13 +184,16 @@ class SizeClassAllocator32 { getStats(Str, I, 0); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Primary, but not an error either. + return true; } uptr releaseToOS() { @@ -423,10 +426,6 @@ class SizeClassAllocator32 { AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -457,7 +456,8 @@ class SizeClassAllocator32 { } if (!Force) { - const s32 IntervalMs = getReleaseToOsIntervalMs(); + const s32 IntervalMs = + atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + @@ -467,28 +467,33 @@ class SizeClassAllocator32 { } } - // TODO(kostyak): currently not ideal as we loop over all regions and - // iterate multiple times over the same freelist if a ClassId spans multiple - // regions. But it will have to do for now. - uptr TotalReleasedBytes = 0; - const uptr MaxSize = (RegionSize / BlockSize) * BlockSize; + DCHECK_GT(MinRegionIndex, 0U); + uptr First = 0; for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) { if (PossibleRegions[I] - 1U == ClassId) { - const uptr Region = I * RegionSize; - // If the region is the one currently associated to the size-class, we - // only need to release up to CurrentRegionAllocated, MaxSize otherwise. - const uptr Size = (Region == Sci->CurrentRegion) - ? 
Sci->CurrentRegionAllocated - : MaxSize; - ReleaseRecorder Recorder(Region); - releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize, - &Recorder); - if (Recorder.getReleasedRangesCount() > 0) { - Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; - Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); - Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); - TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; - } + First = I; + break; + } + } + uptr Last = 0; + for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) { + if (PossibleRegions[I] - 1U == ClassId) { + Last = I; + break; + } + } + uptr TotalReleasedBytes = 0; + if (First && Last) { + const uptr Base = First * RegionSize; + const uptr NumberOfRegions = Last - First + 1U; + ReleaseRecorder Recorder(Base); + releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions, + BlockSize, &Recorder); + if (Recorder.getReleasedRangesCount() > 0) { + Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; + Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); + Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); + TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; } } Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 01e674bf3fba5..ad92ae250e1f4 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -91,7 +91,7 @@ class SizeClassAllocator64 { if (Region->CanRelease) Region->ReleaseInfo.LastReleaseAtNs = Time; } - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); if (SupportsMemoryTagging) UseMemoryTagging = systemSupportsMemoryTagging(); @@ -185,13 +185,16 @@ class SizeClassAllocator64 { getStats(Str, I, 0); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Primary, but not an error either. 
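As an aside on the Option::ReleaseInterval branch above: the interval is clamped into [MinReleaseToOsIntervalMs, MaxReleaseToOsIntervalMs] before a relaxed atomic store. A minimal standalone sketch of the same pattern (std:: equivalents stand in for scudo's Min/Max and atomic helpers; the bounds here are invented):

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>

// Hypothetical bounds; scudo takes them as template parameters.
constexpr int32_t MinIntervalMs = 0;
constexpr int32_t MaxIntervalMs = 1000;

std::atomic<int32_t> ReleaseToOsIntervalMs{0};

// Clamp, then store. A relaxed store suffices because readers only need
// some recent value of the interval, not ordering with other memory.
void setReleaseInterval(intptr_t Value) {
  const int32_t Interval = std::max(
      std::min(static_cast<int32_t>(Value), MaxIntervalMs), MinIntervalMs);
  ReleaseToOsIntervalMs.store(Interval, std::memory_order_relaxed);
}

int main() {
  setReleaseInterval(5000); // Clamped down to the maximum.
  assert(ReleaseToOsIntervalMs.load(std::memory_order_relaxed) == 1000);
  setReleaseInterval(-7); // Clamped up to the minimum.
  assert(ReleaseToOsIntervalMs.load(std::memory_order_relaxed) == 0);
  return 0;
}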
+ return true; } uptr releaseToOS() { @@ -435,10 +438,6 @@ class SizeClassAllocator64 { getRegionBaseByClassId(ClassId)); } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); @@ -469,7 +468,8 @@ class SizeClassAllocator64 { } if (!Force) { - const s32 IntervalMs = getReleaseToOsIntervalMs(); + const s32 IntervalMs = + atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + @@ -481,7 +481,7 @@ class SizeClassAllocator64 { ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data); releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg, - Region->AllocatedUser, BlockSize, &Recorder); + Region->AllocatedUser, 1U, BlockSize, &Recorder); if (Recorder.getReleasedRangesCount() > 0) { Region->ReleaseInfo.PushedBlocksAtLastRelease = diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h index fd55ea24132e6..748e1c0011530 100644 --- a/compiler-rt/lib/scudo/standalone/release.h +++ b/compiler-rt/lib/scudo/standalone/release.h @@ -49,7 +49,10 @@ class ReleaseRecorder { // incremented past MaxValue. class PackedCounterArray { public: - PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) { + PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion, + uptr MaxValue) + : Regions(NumberOfRegions), NumCounters(CountersPerRegion) { + CHECK_GT(Regions, 0); CHECK_GT(NumCounters, 0); CHECK_GT(MaxValue, 0); constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL; @@ -66,9 +69,10 @@ class PackedCounterArray { PackingRatioLog = getLog2(PackingRatio); BitOffsetMask = PackingRatio - 1; - BufferSize = (roundUpTo(N, static_cast(1U) << PackingRatioLog) >> - PackingRatioLog) * - sizeof(*Buffer); + SizePerRegion = + roundUpTo(NumCounters, static_cast(1U) << PackingRatioLog) >> + PackingRatioLog; + BufferSize = SizePerRegion * sizeof(*Buffer) * Regions; if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) && Mutex.tryLock()) { Buffer = &StaticBuffer[0]; @@ -89,41 +93,45 @@ class PackedCounterArray { bool isAllocated() const { return !!Buffer; } - uptr getCount() const { return N; } + uptr getCount() const { return NumCounters; } - uptr get(uptr I) const { - DCHECK_LT(I, N); + uptr get(uptr Region, uptr I) const { + DCHECK_LT(Region, Regions); + DCHECK_LT(I, NumCounters); const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; - return (Buffer[Index] >> BitOffset) & CounterMask; + return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask; } - void inc(uptr I) const { - DCHECK_LT(get(I), CounterMask); + void inc(uptr Region, uptr I) const { + DCHECK_LT(get(Region, I), CounterMask); const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; DCHECK_LT(BitOffset, SCUDO_WORDSIZE); - Buffer[Index] += static_cast(1U) << BitOffset; + Buffer[Region * SizePerRegion + Index] += static_cast(1U) + << BitOffset; } - void incRange(uptr From, uptr To) const { + void incRange(uptr Region, uptr From, uptr To) const { DCHECK_LE(From, To); - const uptr Top = Min(To + 1, N); + const uptr Top = Min(To + 1, NumCounters); for (uptr I = From; I < Top; I++) - inc(I); + inc(Region, I); } uptr getBufferSize() const { return BufferSize; } - static const uptr StaticBufferCount = 1024U; + static 
const uptr StaticBufferCount = 2048U; private: - const uptr N; + const uptr Regions; + const uptr NumCounters; uptr CounterSizeBitsLog; uptr CounterMask; uptr PackingRatioLog; uptr BitOffsetMask; + uptr SizePerRegion; uptr BufferSize; uptr *Buffer; @@ -169,7 +177,8 @@ template class FreePagesRangeTracker { template NOINLINE void releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, - uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) { + uptr RegionSize, uptr NumberOfRegions, uptr BlockSize, + ReleaseRecorderT *Recorder) { const uptr PageSize = getPageSizeCached(); // Figure out the number of chunks per page and whether we can take a fast @@ -206,13 +215,15 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, } } - const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize; - PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax); + const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize; + PackedCounterArray Counters(NumberOfRegions, PagesCount, + FullPagesBlockCountMax); if (!Counters.isAllocated()) return; const uptr PageSizeLog = getLog2(PageSize); - const uptr RoundedSize = PagesCount << PageSizeLog; + const uptr RoundedRegionSize = PagesCount << PageSizeLog; + const uptr RoundedSize = NumberOfRegions * RoundedRegionSize; // Iterate over free chunks and count how many free chunks affect each // allocated page. @@ -228,14 +239,17 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { const uptr P = reinterpret_cast(It.get(I)) - Base; // This takes care of P < Base and P >= Base + RoundedSize. - if (P < RoundedSize) - Counters.inc(P >> PageSizeLog); + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + const uptr PInRegion = P - RegionIndex * RegionSize; + Counters.inc(RegionIndex, PInRegion >> PageSizeLog); + } } } - for (uptr P = Size; P < RoundedSize; P += BlockSize) - Counters.inc(P >> PageSizeLog); } else { // In all other cases chunks might affect more than one page. + DCHECK_GE(RegionSize, BlockSize); + const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; for (const auto &It : FreeList) { // See TransferBatch comment above. const bool IsTransferBatch = @@ -244,13 +258,24 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { const uptr P = reinterpret_cast(It.get(I)) - Base; // This takes care of P < Base and P >= Base + RoundedSize. - if (P < RoundedSize) - Counters.incRange(P >> PageSizeLog, - (P + BlockSize - 1) >> PageSizeLog); + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + uptr PInRegion = P - RegionIndex * RegionSize; + Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, + (PInRegion + BlockSize - 1) >> PageSizeLog); + // The last block in a region might straddle a page, so if it's + // free, we mark the following "pretend" memory block(s) as free. 
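The "pretend" blocks mentioned above deserve a worked example: a region's size is generally not an exact multiple of the block size, so when the last real block is free, the counters for the trailing pages would otherwise never reach their full count and the tail could not be released. A standalone sketch of the offset arithmetic (sizes are invented; the real code bounds the loop by the page-rounded region size):

#include <cstdio>

using uptr = unsigned long;

int main() {
  const uptr PageSize = 4096;
  const uptr RegionSize = 10 * PageSize;
  const uptr BlockSize = 4104; // Deliberately not page-aligned.
  const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
  printf("last real block starts at %lu, spans pages %lu..%lu\n",
         LastBlockInRegion, LastBlockInRegion / PageSize,
         (LastBlockInRegion + BlockSize - 1) / PageSize);
  // Offsets past the last real block that the release logic marks as free
  // "pretend" blocks whenever the last real block is on the freelist.
  for (uptr P = LastBlockInRegion + BlockSize; P < RegionSize; P += BlockSize)
    printf("pretend block at offset %lu\n", P);
  return 0;
}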
+ if (PInRegion == LastBlockInRegion) { + PInRegion += BlockSize; + while (PInRegion < RoundedRegionSize) { + Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, + (PInRegion + BlockSize - 1) >> PageSizeLog); + PInRegion += BlockSize; + } + } + } } } - for (uptr P = Size; P < RoundedSize; P += BlockSize) - Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog); } // Iterate over pages detecting ranges of pages with chunk Counters equal @@ -258,8 +283,10 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, FreePagesRangeTracker RangeTracker(Recorder); if (SameBlockCountPerPage) { // Fast path, every page has the same number of chunks affecting it. - for (uptr I = 0; I < Counters.getCount(); I++) - RangeTracker.processNextPage(Counters.get(I) == FullPagesBlockCountMax); + for (uptr I = 0; I < NumberOfRegions; I++) + for (uptr J = 0; J < PagesCount; J++) + RangeTracker.processNextPage(Counters.get(I, J) == + FullPagesBlockCountMax); } else { // Slow path, go through the pages keeping count how many chunks affect // each page. @@ -270,23 +297,25 @@ releaseFreeMemoryToOS(const IntrusiveList &FreeList, uptr Base, // except the first and the last one) and then the last chunk size, adding // up the number of chunks on the current page and checking on every step // whether the page boundary was crossed. - uptr PrevPageBoundary = 0; - uptr CurrentBoundary = 0; - for (uptr I = 0; I < Counters.getCount(); I++) { - const uptr PageBoundary = PrevPageBoundary + PageSize; - uptr BlocksPerPage = Pn; - if (CurrentBoundary < PageBoundary) { - if (CurrentBoundary > PrevPageBoundary) - BlocksPerPage++; - CurrentBoundary += Pnc; + for (uptr I = 0; I < NumberOfRegions; I++) { + uptr PrevPageBoundary = 0; + uptr CurrentBoundary = 0; + for (uptr J = 0; J < PagesCount; J++) { + const uptr PageBoundary = PrevPageBoundary + PageSize; + uptr BlocksPerPage = Pn; if (CurrentBoundary < PageBoundary) { - BlocksPerPage++; - CurrentBoundary += BlockSize; + if (CurrentBoundary > PrevPageBoundary) + BlocksPerPage++; + CurrentBoundary += Pnc; + if (CurrentBoundary < PageBoundary) { + BlocksPerPage++; + CurrentBoundary += BlockSize; + } } - } - PrevPageBoundary = PageBoundary; + PrevPageBoundary = PageBoundary; - RangeTracker.processNextPage(Counters.get(I) == BlocksPerPage); + RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage); + } } } RangeTracker.finish(); diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 84eaa5091b434..b5bb53ddcf2d9 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -56,14 +56,21 @@ class MapAllocatorNoCache { return false; } bool store(UNUSED LargeBlock::Header *H) { return false; } - static bool canCache(UNUSED uptr Size) { return false; } + bool canCache(UNUSED uptr Size) { return false; } void disable() {} void enable() {} void releaseToOS() {} - void setReleaseToOsIntervalMs(UNUSED s32 Interval) {} + bool setOption(Option O, UNUSED sptr Value) { + if (O == Option::ReleaseInterval || O == Option::MaxCacheEntriesCount || + O == Option::MaxCacheEntrySize) + return false; + // Not supported by the Secondary Cache, but not an error either. + return true; + } }; -template class MapAllocatorCache { @@ -71,10 +78,17 @@ class MapAllocatorCache { // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length // arrays are an extension for some compilers. // FIXME(kostyak): support (partially) the cache on Fuchsia. 
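The static_asserts that follow this FIXME pin the template configuration at compile time; the same guard pattern in a self-contained form (illustrative names, not scudo's):

#include <cstddef>

template <size_t ArraySize = 32U, size_t DefaultMaxCount = 32U>
class CacheSketch {
  // Reject configurations where the runtime-tunable default could never
  // fit the statically sized backing array.
  static_assert(DefaultMaxCount <= ArraySize,
                "default entry count exceeds the entries array size");
};

int main() {
  CacheSketch<32, 32> Ok;
  // CacheSketch<16, 32> Bad; // Would fail to compile.
  (void)Ok;
  return 0;
}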
- static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); + static_assert(!SCUDO_FUCHSIA || EntriesArraySize == 0U, ""); + + // Ensure the default maximum specified fits the array. + static_assert(DefaultMaxEntriesCount <= EntriesArraySize, ""); void initLinkerInitialized(s32 ReleaseToOsInterval) { - setReleaseToOsIntervalMs(ReleaseToOsInterval); + setOption(Option::MaxCacheEntriesCount, + static_cast(DefaultMaxEntriesCount)); + setOption(Option::MaxCacheEntrySize, + static_cast(DefaultMaxEntrySize)); + setOption(Option::ReleaseInterval, static_cast(ReleaseToOsInterval)); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -85,13 +99,14 @@ class MapAllocatorCache { bool EntryCached = false; bool EmptyCache = false; const u64 Time = getMonotonicTime(); + const u32 MaxCount = atomic_load(&MaxEntriesCount, memory_order_relaxed); { ScopedLock L(Mutex); - if (EntriesCount == MaxEntriesCount) { + if (EntriesCount >= MaxCount) { if (IsFullEvents++ == 4U) EmptyCache = true; } else { - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (u32 I = 0; I < MaxCount; I++) { if (Entries[I].Block) continue; if (I != 0) @@ -111,17 +126,19 @@ class MapAllocatorCache { s32 Interval; if (EmptyCache) empty(); - else if ((Interval = getReleaseToOsIntervalMs()) >= 0) + else if ((Interval = atomic_load(&ReleaseToOsIntervalMs, + memory_order_relaxed)) >= 0) releaseOlderThan(Time - static_cast(Interval) * 1000000); return EntryCached; } bool retrieve(uptr Size, LargeBlock::Header **H) { const uptr PageSize = getPageSizeCached(); + const u32 MaxCount = atomic_load(&MaxEntriesCount, memory_order_relaxed); ScopedLock L(Mutex); if (EntriesCount == 0) return false; - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (u32 I = 0; I < MaxCount; I++) { if (!Entries[I].Block) continue; const uptr BlockSize = Entries[I].BlockEnd - Entries[I].Block; @@ -141,17 +158,31 @@ class MapAllocatorCache { return false; } - static bool canCache(uptr Size) { - return MaxEntriesCount != 0U && Size <= MaxEntrySize; + bool canCache(uptr Size) { + return atomic_load(&MaxEntriesCount, memory_order_relaxed) != 0U && + Size <= atomic_load(&MaxEntrySize, memory_order_relaxed); } - void setReleaseToOsIntervalMs(s32 Interval) { - if (Interval >= MaxReleaseToOsIntervalMs) { - Interval = MaxReleaseToOsIntervalMs; - } else if (Interval <= MinReleaseToOsIntervalMs) { - Interval = MinReleaseToOsIntervalMs; + bool setOption(Option O, sptr Value) { + if (O == Option::ReleaseInterval) { + const s32 Interval = + Max(Min(static_cast(Value), MaxReleaseToOsIntervalMs), + MinReleaseToOsIntervalMs); + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + return true; + } else if (O == Option::MaxCacheEntriesCount) { + const u32 MaxCount = static_cast(Value); + if (MaxCount > EntriesArraySize) + return false; + atomic_store(&MaxEntriesCount, MaxCount, memory_order_relaxed); + return true; + } else if (O == Option::MaxCacheEntrySize) { + atomic_store(&MaxEntrySize, static_cast(Value), + memory_order_relaxed); + return true; } - atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); + // Not supported by the Secondary Cache, but not an error either. 
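Note the contract established here and in the Primary: a component returns false only for an option it owns and rejects, while options it does not handle report success, so the combined allocator's AND-fold of the three results fails only on a genuine rejection. A distilled sketch of that convention (stub types, not scudo code):

#include <cassert>
#include <cstdint>

enum class Option : uint8_t { ReleaseInterval, MaxCacheEntriesCount, MaxTSDsCount };

struct CacheStub {
  static constexpr uint32_t EntriesArraySize = 32;
  uint32_t MaxEntriesCount = 0;
  bool setOption(Option O, intptr_t Value) {
    if (O == Option::MaxCacheEntriesCount) {
      if (static_cast<uint32_t>(Value) > EntriesArraySize)
        return false; // Owned and invalid: report failure.
      MaxEntriesCount = static_cast<uint32_t>(Value);
      return true;
    }
    return true; // Not owned: not an error either.
  }
};

int main() {
  CacheStub C;
  assert(C.setOption(Option::MaxTSDsCount, 16));            // Ignored, succeeds.
  assert(!C.setOption(Option::MaxCacheEntriesCount, 4096)); // Too big, fails.
  assert(!C.setOption(Option::MaxCacheEntriesCount, -1));   // Wraps huge, fails.
  assert(C.setOption(Option::MaxCacheEntriesCount, 16));
  return 0;
}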
+ return true; } void releaseToOS() { releaseOlderThan(UINT64_MAX); } @@ -166,11 +197,11 @@ class MapAllocatorCache { void *MapBase; uptr MapSize; MapPlatformData Data; - } MapInfo[MaxEntriesCount]; + } MapInfo[EntriesArraySize]; uptr N = 0; { ScopedLock L(Mutex); - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (uptr I = 0; I < EntriesArraySize; I++) { if (!Entries[I].Block) continue; MapInfo[N].MapBase = reinterpret_cast(Entries[I].MapBase); @@ -191,7 +222,7 @@ class MapAllocatorCache { ScopedLock L(Mutex); if (!EntriesCount) return; - for (uptr I = 0; I < MaxEntriesCount; I++) { + for (uptr I = 0; I < EntriesArraySize; I++) { if (!Entries[I].Block || !Entries[I].Time || Entries[I].Time > Time) continue; releasePagesToOS(Entries[I].Block, 0, @@ -201,10 +232,6 @@ class MapAllocatorCache { } } - s32 getReleaseToOsIntervalMs() { - return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); - } - struct CachedBlock { uptr Block; uptr BlockEnd; @@ -215,8 +242,10 @@ class MapAllocatorCache { }; HybridMutex Mutex; - CachedBlock Entries[MaxEntriesCount]; + CachedBlock Entries[EntriesArraySize]; u32 EntriesCount; + atomic_u32 MaxEntriesCount; + atomic_uptr MaxEntrySize; uptr LargestSize; u32 IsFullEvents; atomic_s32 ReleaseToOsIntervalMs; @@ -265,11 +294,9 @@ template class MapAllocator { Callback(reinterpret_cast(&H) + LargeBlock::getHeaderSize()); } - static uptr canCache(uptr Size) { return CacheT::canCache(Size); } + uptr canCache(uptr Size) { return Cache.canCache(Size); } - void setReleaseToOsIntervalMs(s32 Interval) { - Cache.setReleaseToOsIntervalMs(Interval); - } + bool setOption(Option O, sptr Value) { return Cache.setOption(O, Value); } void releaseToOS() { Cache.releaseToOS(); } @@ -306,7 +333,7 @@ void *MapAllocator::allocate(uptr Size, uptr AlignmentHint, const uptr RoundedSize = roundUpTo(Size + LargeBlock::getHeaderSize(), PageSize); - if (AlignmentHint < PageSize && CacheT::canCache(RoundedSize)) { + if (AlignmentHint < PageSize && Cache.canCache(RoundedSize)) { LargeBlock::Header *H; if (Cache.retrieve(RoundedSize, &H)) { if (BlockEnd) @@ -400,7 +427,7 @@ template void MapAllocator::deallocate(void *Ptr) { Stats.sub(StatAllocated, CommitSize); Stats.sub(StatMapped, H->MapSize); } - if (CacheT::canCache(CommitSize) && Cache.store(H)) + if (Cache.canCache(CommitSize) && Cache.store(H)) return; void *Addr = reinterpret_cast(H->MapBase); const uptr Size = H->MapSize; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index b5035d05aabb0..9689c4265e06c 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -12,13 +12,14 @@ #include "combined.h" #include +#include #include #include #include static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc; @@ -77,7 +78,7 @@ template struct TestAllocator : scudo::Allocator { template static void testAllocator() { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); EXPECT_FALSE(Allocator->isOwned(&Mutex)); EXPECT_FALSE(Allocator->isOwned(&Allocator)); @@ -350,8 +351,9 @@ template static void stressAllocator(AllocatorT *A) { } template static void testAllocatorThreaded() { + Ready = false; using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = 
std::unique_ptr(new AllocatorT()); std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) Threads[I] = std::thread(stressAllocator, Allocator.get()); @@ -393,12 +395,12 @@ struct DeathConfig { typedef scudo::SizeClassAllocator64 Primary; typedef scudo::MapAllocator Secondary; - template using TSDRegistryT = scudo::TSDRegistrySharedT; + template using TSDRegistryT = scudo::TSDRegistrySharedT; }; TEST(ScudoCombinedTest, DeathCombined) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); const scudo::uptr Size = 1000U; void *P = Allocator->allocate(Size, Origin); @@ -433,7 +435,7 @@ TEST(ScudoCombinedTest, DeathCombined) { // operation without issue. TEST(ScudoCombinedTest, ReleaseToOS) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); Allocator->releaseToOS(); } @@ -442,7 +444,7 @@ TEST(ScudoCombinedTest, ReleaseToOS) { // fulfill the allocation through a larger size class. TEST(ScudoCombinedTest, FullRegion) { using AllocatorT = TestAllocator; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); std::vector V; scudo::uptr FailedAllocationsCount = 0; @@ -473,7 +475,7 @@ TEST(ScudoCombinedTest, FullRegion) { TEST(ScudoCombinedTest, OddEven) { using AllocatorT = TestAllocator; using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap; - auto Allocator = std::make_unique(); + auto Allocator = std::unique_ptr(new AllocatorT()); if (!Allocator->useMemoryTagging()) return; diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index 010bf84490e0e..a7a2b3160611e 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -149,7 +149,7 @@ TEST(ScudoPrimaryTest, PrimaryIterate) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; template static void performAllocations(Primary *Allocator) { static THREADLOCAL typename Primary::CacheT Cache; @@ -176,6 +176,7 @@ template static void performAllocations(Primary *Allocator) { } template static void testPrimaryThreaded() { + Ready = false; auto Deleter = [](Primary *P) { P->unmapTestOnly(); delete P; diff --git a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp index a7478f47479d5..a693b97f80da6 100644 --- a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp @@ -21,14 +21,14 @@ TEST(ScudoReleaseTest, PackedCounterArray) { for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) { // Various valid counter's max values packed into one word. - scudo::PackedCounterArray Counters2N(1, 1UL << I); + scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I); EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize()); // Check the "all bit set" values too. - scudo::PackedCounterArray Counters2N1_1(1, ~0UL >> I); + scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I); EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize()); // Verify the packing ratio, the counter is Expected to be packed into the // closest power of 2 bits. 
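The buffer-size expectations in this test follow from the packing arithmetic: a counter whose maximum value needs B bits is stored in roundUpToPowerOfTwo(B) bits, so offsets reduce to shifts and masks, and a machine word holds WordBits divided by that width counters. A small sketch of that computation (not the scudo implementation):

#include <cassert>
#include <cstdint>

using uptr = uintptr_t;

// Smallest power-of-two bit width able to represent MaxValue.
uptr counterBits(uptr MaxValue) {
  uptr Bits = 1;
  while ((static_cast<uptr>(1) << Bits) - 1 < MaxValue)
    Bits *= 2; // Power-of-two widths keep offsets computable by shift/mask.
  return Bits;
}

int main() {
  const uptr WordBits = sizeof(uptr) * 8;
  assert(counterBits(1) == 1);   // 64 one-bit counters per 64-bit word.
  assert(counterBits(7) == 4);   // Needs 3 bits, rounded up to 4.
  assert(counterBits(255) == 8); // Exact fit.
  // The "packing ratio" is counters per word.
  assert(WordBits / counterBits(1) == WordBits);
  return 0;
}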
- scudo::PackedCounterArray Counters(SCUDO_WORDSIZE, 1UL << I); + scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I); EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1), Counters.getBufferSize()); } @@ -38,19 +38,20 @@ TEST(ScudoReleaseTest, PackedCounterArray) { // Make sure counters request one memory page for the buffer. const scudo::uptr NumCounters = (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I); - scudo::PackedCounterArray Counters(NumCounters, 1UL << ((1UL << I) - 1)); - Counters.inc(0); + scudo::PackedCounterArray Counters(1U, NumCounters, + 1UL << ((1UL << I) - 1)); + Counters.inc(0U, 0U); for (scudo::uptr C = 1; C < NumCounters - 1; C++) { - EXPECT_EQ(0UL, Counters.get(C)); - Counters.inc(C); - EXPECT_EQ(1UL, Counters.get(C - 1)); + EXPECT_EQ(0UL, Counters.get(0U, C)); + Counters.inc(0U, C); + EXPECT_EQ(1UL, Counters.get(0U, C - 1)); } - EXPECT_EQ(0UL, Counters.get(NumCounters - 1)); - Counters.inc(NumCounters - 1); + EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1)); + Counters.inc(0U, NumCounters - 1); if (I > 0) { - Counters.incRange(0, NumCounters - 1); + Counters.incRange(0u, 0U, NumCounters - 1); for (scudo::uptr C = 0; C < NumCounters; C++) - EXPECT_EQ(2UL, Counters.get(C)); + EXPECT_EQ(2UL, Counters.get(0U, C)); } } } @@ -190,7 +191,7 @@ template void testReleaseFreeMemoryToOS() { // Release the memory. ReleasedPagesRecorder Recorder; - releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, BlockSize, + releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize, &Recorder); // Verify that there are no released pages touched by used chunks and all diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index d2260b9c15b07..29efdb3060128 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -21,7 +21,7 @@ template static void testSecondaryBasic(void) { scudo::GlobalStats S; S.init(); - SecondaryT *L = new SecondaryT; + std::unique_ptr L(new SecondaryT); L->init(&S); const scudo::uptr Size = 1U << 16; void *P = L->allocate(Size); @@ -30,7 +30,7 @@ template static void testSecondaryBasic(void) { EXPECT_GE(SecondaryT::getBlockSize(P), Size); L->deallocate(P); // If the Secondary can't cache that pointer, it will be unmapped. 
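On the canCache()/store() interplay this comment refers to, seen from the deallocation path: if the block fits the cache it stays mapped for reuse, otherwise it gets unmapped, hence the death check that follows. A toy model of that decision (a std::map stands in for the fixed entries array; none of the real bookkeeping):

#include <cassert>
#include <cstddef>
#include <map>

struct ToyCache {
  size_t MaxEntrySize = 1UL << 19;
  std::map<size_t, void *> Entries;
  bool canCache(size_t Size) const { return Size <= MaxEntrySize; }
  bool store(size_t Size, void *Block) {
    if (!canCache(Size))
      return false; // Caller unmaps the block instead.
    Entries[Size] = Block; // Real code also records map base/size and time.
    return true;
  }
};

int main() {
  ToyCache C;
  int Block = 0;
  assert(C.store(1UL << 16, &Block));  // Cached: stays accessible.
  assert(!C.store(1UL << 20, &Block)); // Too large: would be unmapped.
  return 0;
}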
- if (!SecondaryT::canCache(Size)) + if (!L->canCache(Size)) EXPECT_DEATH(memset(P, 'A', Size), ""); const scudo::uptr Align = 1U << 16; @@ -59,7 +59,7 @@ TEST(ScudoSecondaryTest, SecondaryBasic) { #if !SCUDO_FUCHSIA testSecondaryBasic>>(); testSecondaryBasic< - scudo::MapAllocator>>(); + scudo::MapAllocator>>(); #endif } @@ -75,7 +75,7 @@ using LargeAllocator = scudo::MapAllocator>; TEST(ScudoSecondaryTest, SecondaryCombinations) { constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16); constexpr scudo::uptr HeaderSize = scudo::roundUpTo(8, MinAlign); - LargeAllocator *L = new LargeAllocator; + std::unique_ptr L(new LargeAllocator); L->init(nullptr); for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) { for (scudo::uptr AlignLog = FIRST_32_SECOND_64(3, 4); AlignLog <= 16; @@ -103,7 +103,7 @@ TEST(ScudoSecondaryTest, SecondaryCombinations) { } TEST(ScudoSecondaryTest, SecondaryIterate) { - LargeAllocator *L = new LargeAllocator; + std::unique_ptr L(new LargeAllocator); L->init(nullptr); std::vector V; const scudo::uptr PageSize = scudo::getPageSizeCached(); @@ -125,9 +125,32 @@ TEST(ScudoSecondaryTest, SecondaryIterate) { Str.output(); } +TEST(ScudoSecondaryTest, SecondaryOptions) { + std::unique_ptr L(new LargeAllocator); + L->init(nullptr); + // Attempt to set a maximum number of entries higher than the array size. + EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); + // A negative number will be cast to a scudo::u32, and fail. + EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, -1)); + if (L->canCache(0U)) { + // Various valid combinations. + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(L->canCache(1UL << 18)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); + EXPECT_FALSE(L->canCache(1UL << 18)); + EXPECT_TRUE(L->canCache(1UL << 16)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); + EXPECT_FALSE(L->canCache(1UL << 16)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); + EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); + EXPECT_TRUE(L->canCache(1UL << 16)); + } +} + static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static void performAllocations(LargeAllocator *L) { std::vector V; @@ -153,11 +176,12 @@ static void performAllocations(LargeAllocator *L) { } TEST(ScudoSecondaryTest, SecondaryThreadsRace) { - LargeAllocator *L = new LargeAllocator; + Ready = false; + std::unique_ptr L(new LargeAllocator); L->init(nullptr, /*ReleaseToOsInterval=*/0); std::thread Threads[16]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread(performAllocations, L); + Threads[I] = std::thread(performAllocations, L.get()); { std::unique_lock Lock(Mutex); Ready = true; diff --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp index 4a3cf1cd0fc31..561bda47e24c8 100644 --- a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp @@ -13,6 +13,7 @@ #include #include +#include #include // We mock out an allocator with a TSD registry, mostly using empty stubs. 
The @@ -47,12 +48,12 @@ template class MockAllocator { struct OneCache { template - using TSDRegistryT = scudo::TSDRegistrySharedT; + using TSDRegistryT = scudo::TSDRegistrySharedT; }; struct SharedCaches { template - using TSDRegistryT = scudo::TSDRegistrySharedT; + using TSDRegistryT = scudo::TSDRegistrySharedT; }; struct ExclusiveCaches { @@ -116,7 +117,7 @@ TEST(ScudoTSDTest, TSDRegistryBasic) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; template static void stressCache(AllocatorT *Allocator) { auto Registry = Allocator->getTSDRegistry(); @@ -145,6 +146,7 @@ template static void stressCache(AllocatorT *Allocator) { } template static void testRegistryThreaded() { + Ready = false; auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; @@ -171,3 +173,73 @@ TEST(ScudoTSDTest, TSDRegistryThreaded) { testRegistryThreaded>(); #endif } + +static std::set Pointers; + +static void stressSharedRegistry(MockAllocator *Allocator) { + std::set Set; + auto Registry = Allocator->getTSDRegistry(); + { + std::unique_lock Lock(Mutex); + while (!Ready) + Cv.wait(Lock); + } + Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false); + bool UnlockRequired; + for (scudo::uptr I = 0; I < 4096U; I++) { + auto TSD = Registry->getTSDAndLock(&UnlockRequired); + EXPECT_NE(TSD, nullptr); + Set.insert(reinterpret_cast(TSD)); + if (UnlockRequired) + TSD->unlock(); + } + { + std::unique_lock Lock(Mutex); + Pointers.insert(Set.begin(), Set.end()); + } +} + +TEST(ScudoTSDTest, TSDRegistryTSDsCount) { + Ready = false; + using AllocatorT = MockAllocator; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr Allocator(new AllocatorT, + Deleter); + Allocator->reset(); + // We attempt to use as many TSDs as the shared cache offers by creating a + // decent amount of threads that will be run concurrently and attempt to get + // and lock TSDs. We put them all in a set and count the number of entries + // after we are done. + std::thread Threads[32]; + for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) + Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); + { + std::unique_lock Lock(Mutex); + Ready = true; + Cv.notify_all(); + } + for (auto &T : Threads) + T.join(); + // The initial number of TSDs we get will be the minimum of the default count + // and the number of CPUs. + EXPECT_LE(Pointers.size(), 8U); + Pointers.clear(); + auto Registry = Allocator->getTSDRegistry(); + // Increase the number of TSDs to 16. + Registry->setOption(scudo::Option::MaxTSDsCount, 16); + Ready = false; + for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) + Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); + { + std::unique_lock Lock(Mutex); + Ready = true; + Cv.notify_all(); + } + for (auto &T : Threads) + T.join(); + // We should get 16 distinct TSDs back. 
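The second round of threads in this test works because raising MaxTSDsCount can only grow the TSD count; setNumberOfTSDs (in tsd_shared.h further down) refuses to shrink and caps at the array size. The rule in isolation (globals used for brevity):

#include <cassert>
#include <cstdint>

constexpr uint32_t TSDsArraySize = 32;
uint32_t NumberOfTSDs = 8;

// Grow-only resize, capped by the static array: threads may already hold
// references to existing TSDs, so shrinking is never allowed.
bool setNumberOfTSDs(uint32_t N) {
  if (N < NumberOfTSDs)
    return false;
  if (N > TSDsArraySize)
    N = TSDsArraySize;
  NumberOfTSDs = N;
  return true;
}

int main() {
  assert(setNumberOfTSDs(16));
  assert(!setNumberOfTSDs(4)); // Cannot shrink below the current count.
  assert(setNumberOfTSDs(64) && NumberOfTSDs == TSDsArraySize); // Capped.
  return 0;
}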
+ EXPECT_EQ(Pointers.size(), 16U); +} diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 4b851a7af181f..3ddc4ec818327 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -202,6 +202,12 @@ TEST(ScudoWrappersCTest, MallOpt) { EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1); EXPECT_EQ(mallopt(M_DECAY_TIME, 1), 1); EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1); + + if (SCUDO_ANDROID) { + EXPECT_EQ(mallopt(M_CACHE_COUNT_MAX, 100), 1); + EXPECT_EQ(mallopt(M_CACHE_SIZE_MAX, 1024 * 1024 * 2), 1); + EXPECT_EQ(mallopt(M_TSDS_COUNT_MAX, 10), 1); + } } #endif @@ -389,6 +395,7 @@ static void *enableMalloc(void *Unused) { TEST(ScudoWrappersCTest, DisableForkEnable) { pthread_t ThreadId; + Ready = false; EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0); // Wait for the thread to be warmed up. diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp index 4ccef5bb0deec..d24b6651d95e3 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp @@ -79,7 +79,7 @@ TEST(ScudoWrappersCppTest, New) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready = false; +static bool Ready; static void stressNew() { std::vector V; @@ -103,6 +103,7 @@ static void stressNew() { } TEST(ScudoWrappersCppTest, ThreadedNew) { + Ready = false; std::thread Threads[32]; for (size_t I = 0U; I < sizeof(Threads) / sizeof(Threads[0]); I++) Threads[I] = std::thread(stressNew); diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index 3492509b5a8eb..ac5a22c970701 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -66,6 +66,12 @@ template struct TSDRegistryExT { Mutex.unlock(); } + bool setOption(Option O, UNUSED sptr Value) { + if (O == Option::MaxTSDsCount) + return false; + return true; + } + private: void initOnceMaybe(Allocator *Instance) { ScopedLock L(Mutex); diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 038a5905ff489..25ba191826c3f 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -14,31 +14,16 @@ namespace scudo { -template struct TSDRegistrySharedT { +template +struct TSDRegistrySharedT { void initLinkerInitialized(Allocator *Instance) { Instance->initLinkerInitialized(); CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS - const u32 NumberOfCPUs = getNumberOfCPUs(); - NumberOfTSDs = (SCUDO_ANDROID || NumberOfCPUs == 0) - ? MaxTSDCount - : Min(NumberOfCPUs, MaxTSDCount); - for (u32 I = 0; I < NumberOfTSDs; I++) + for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].initLinkerInitialized(Instance); - // Compute all the coprimes of NumberOfTSDs. This will be used to walk the - // array of TSDs in a random order. For details, see: - // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/ - for (u32 I = 0; I < NumberOfTSDs; I++) { - u32 A = I + 1; - u32 B = NumberOfTSDs; - // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes. 
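This GCD computation, moved below into setNumberOfTSDs, is plain iterative Euclid; collecting the strides whose GCD with N is 1 yields exactly Euler's phi(N) coprimes. A standalone check (sketch, not scudo code):

#include <cassert>
#include <cstdint>

uint32_t gcd(uint32_t A, uint32_t B) {
  while (B != 0) { // Iterative Euclid, as in the registry initialization.
    const uint32_t T = A;
    A = B;
    B = T % B;
  }
  return A;
}

int main() {
  const uint32_t N = 8;
  uint32_t CoPrimes[8];
  uint32_t Count = 0;
  for (uint32_t I = 0; I < N; I++)
    if (gcd(I + 1, N) == 1)
      CoPrimes[Count++] = I + 1;
  // For N = 8 the coprime strides are 1, 3, 5, 7: phi(8) = 4 of them.
  assert(Count == 4 && CoPrimes[0] == 1 && CoPrimes[3] == 7);
  return 0;
}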
-      while (B != 0) {
-        const u32 T = A;
-        A = B;
-        B = T % B;
-      }
-      if (A == 1)
-        CoPrimes[NumberOfCoPrimes++] = I + 1;
-    }
+    const u32 NumberOfCPUs = getNumberOfCPUs();
+    setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount
+                                        : Min(NumberOfCPUs, DefaultTSDCount));
     Initialized = true;
   }
   void init(Allocator *Instance) {
@@ -66,21 +51,34 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
     if (TSD->tryLock())
       return TSD;
     // If that fails, go down the slow path.
+    if (TSDsArraySize == 1U) {
+      // Only 1 TSD, no need to go any further.
+      // The compiler will optimize this one way or the other.
+      TSD->lock();
+      return TSD;
+    }
     return getTSDAndLockSlow(TSD);
   }
 
   void disable() {
     Mutex.lock();
-    for (u32 I = 0; I < NumberOfTSDs; I++)
+    for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].lock();
   }
 
   void enable() {
-    for (s32 I = static_cast<s32>(NumberOfTSDs - 1); I >= 0; I--)
+    for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--)
       TSDs[I].unlock();
     Mutex.unlock();
   }
 
+  bool setOption(Option O, sptr Value) {
+    if (O == Option::MaxTSDsCount)
+      return setNumberOfTSDs(static_cast<u32>(Value));
+    // Not supported by the TSD Registry, but not an error either.
+    return true;
+  }
+
 private:
   ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) {
 #if _BIONIC
@@ -104,6 +102,32 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
 #endif
   }
 
+  bool setNumberOfTSDs(u32 N) {
+    ScopedLock L(MutexTSDs);
+    if (N < NumberOfTSDs)
+      return false;
+    if (N > TSDsArraySize)
+      N = TSDsArraySize;
+    NumberOfTSDs = N;
+    NumberOfCoPrimes = 0;
+    // Compute all the coprimes of NumberOfTSDs. This will be used to walk the
+    // array of TSDs in a random order. For details, see:
+    // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/
+    for (u32 I = 0; I < N; I++) {
+      u32 A = I + 1;
+      u32 B = N;
+      // Find the GCD between I + 1 and N. If 1, they are coprimes.
+      while (B != 0) {
+        const u32 T = A;
+        A = B;
+        B = T % B;
+      }
+      if (A == 1)
+        CoPrimes[NumberOfCoPrimes++] = I + 1;
+    }
+    return true;
+  }
+
   void initOnceMaybe(Allocator *Instance) {
     ScopedLock L(Mutex);
     if (LIKELY(Initialized))
@@ -120,17 +144,23 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
   }
 
   NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {
-    if (MaxTSDCount > 1U && NumberOfTSDs > 1U) {
-      // Use the Precedence of the current TSD as our random seed. Since we are
-      // in the slow path, it means that tryLock failed, and as a result it's
-      // very likely that said Precedence is non-zero.
-      const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
-      const u32 Inc = CoPrimes[R % NumberOfCoPrimes];
-      u32 Index = R % NumberOfTSDs;
+    // Use the Precedence of the current TSD as our random seed. Since we are
+    // in the slow path, it means that tryLock failed, and as a result it's
+    // very likely that said Precedence is non-zero.
+    const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
+    u32 N, Inc;
+    {
+      ScopedLock L(MutexTSDs);
+      N = NumberOfTSDs;
+      DCHECK_NE(NumberOfCoPrimes, 0U);
+      Inc = CoPrimes[R % NumberOfCoPrimes];
+    }
+    if (N > 1U) {
+      u32 Index = R % N;
       uptr LowestPrecedence = UINTPTR_MAX;
       TSD<Allocator> *CandidateTSD = nullptr;
       // Go randomly through at most 4 contexts and find a candidate.
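Why step Index by a coprime of N (per the Lemire post cited above): a stride coprime with the array length cycles through all N slots before repeating, so even this bounded 4-probe walk samples distinct TSDs. A quick demonstration of the cycle property:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t N = 8;
  const uint32_t Inc = 3; // Coprime with 8; any coprime stride works.
  bool Seen[8] = {};
  uint32_t Index = 5; // Arbitrary start, e.g. derived from Precedence.
  for (uint32_t I = 0; I < N; I++) {
    assert(!Seen[Index]); // No slot is revisited within one full cycle.
    Seen[Index] = true;
    Index += Inc;
    if (Index >= N) // Same wrap-around as getTSDAndLockSlow.
      Index -= N;
  }
  for (bool B : Seen)
    assert(B); // Every slot was visited exactly once.
  return 0;
}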
- for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) { + for (u32 I = 0; I < Min(4U, N); I++) { if (TSDs[Index].tryLock()) { setCurrentTSD(&TSDs[Index]); return &TSDs[Index]; @@ -142,8 +172,8 @@ template struct TSDRegistrySharedT { LowestPrecedence = Precedence; } Index += Inc; - if (Index >= NumberOfTSDs) - Index -= NumberOfTSDs; + if (Index >= N) + Index -= N; } if (CandidateTSD) { CandidateTSD->lock(); @@ -160,19 +190,20 @@ template struct TSDRegistrySharedT { atomic_u32 CurrentIndex; u32 NumberOfTSDs; u32 NumberOfCoPrimes; - u32 CoPrimes[MaxTSDCount]; + u32 CoPrimes[TSDsArraySize]; bool Initialized; HybridMutex Mutex; - TSD TSDs[MaxTSDCount]; + HybridMutex MutexTSDs; + TSD TSDs[TSDsArraySize]; #if SCUDO_LINUX && !_BIONIC static THREADLOCAL TSD *ThreadTSD; #endif }; #if SCUDO_LINUX && !_BIONIC -template +template THREADLOCAL TSD - *TSDRegistrySharedT::ThreadTSD; + *TSDRegistrySharedT::ThreadTSD; #endif } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc index da5a170569d7c..b25135b1ce1b9 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -155,7 +155,7 @@ void SCUDO_PREFIX(malloc_postinit)() { SCUDO_PREFIX(malloc_enable)); } -INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { +INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) { if (param == M_DECAY_TIME) { if (SCUDO_ANDROID) { if (value == 0) { @@ -173,11 +173,26 @@ INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { } else if (param == M_PURGE) { SCUDO_ALLOCATOR.releaseToOS(); return 1; - } else if (param == M_MEMTAG_TUNING) { - return SCUDO_ALLOCATOR.setOption(scudo::Option::MemtagTuning, - static_cast(value)); + } else { + scudo::Option option; + switch (param) { + case M_MEMTAG_TUNING: + option = scudo::Option::MemtagTuning; + break; + case M_CACHE_COUNT_MAX: + option = scudo::Option::MaxCacheEntriesCount; + break; + case M_CACHE_SIZE_MAX: + option = scudo::Option::MaxCacheEntrySize; + break; + case M_TSDS_COUNT_MAX: + option = scudo::Option::MaxTSDsCount; + break; + default: + return 0; + } + return SCUDO_ALLOCATOR.setOption(option, static_cast(value)); } - return 0; } INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment, diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index 9c3e0369bc6cd..d485af8b60520 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -31,6 +31,8 @@ #include "tsan_mman.h" #include "tsan_fd.h" +#include + using namespace __tsan; #if SANITIZER_FREEBSD || SANITIZER_MAC @@ -135,6 +137,7 @@ const int PTHREAD_BARRIER_SERIAL_THREAD = -1; #endif const int MAP_FIXED = 0x10; typedef long long_t; +typedef __sanitizer::u16 mode_t; // From /usr/include/unistd.h # define F_ULOCK 0 /* Unlock a previously locked region. */ @@ -1508,20 +1511,28 @@ TSAN_INTERCEPTOR(int, fstat64, int fd, void *buf) { #define TSAN_MAYBE_INTERCEPT_FSTAT64 #endif -TSAN_INTERCEPTOR(int, open, const char *name, int flags, int mode) { - SCOPED_TSAN_INTERCEPTOR(open, name, flags, mode); +TSAN_INTERCEPTOR(int, open, const char *name, int oflag, ...) 
{ + va_list ap; + va_start(ap, oflag); + mode_t mode = va_arg(ap, int); + va_end(ap); + SCOPED_TSAN_INTERCEPTOR(open, name, oflag, mode); READ_STRING(thr, pc, name, 0); - int fd = REAL(open)(name, flags, mode); + int fd = REAL(open)(name, oflag, mode); if (fd >= 0) FdFileCreate(thr, pc, fd); return fd; } #if SANITIZER_LINUX -TSAN_INTERCEPTOR(int, open64, const char *name, int flags, int mode) { - SCOPED_TSAN_INTERCEPTOR(open64, name, flags, mode); +TSAN_INTERCEPTOR(int, open64, const char *name, int oflag, ...) { + va_list ap; + va_start(ap, oflag); + mode_t mode = va_arg(ap, int); + va_end(ap); + SCOPED_TSAN_INTERCEPTOR(open64, name, oflag, mode); READ_STRING(thr, pc, name, 0); - int fd = REAL(open64)(name, flags, mode); + int fd = REAL(open64)(name, oflag, mode); if (fd >= 0) FdFileCreate(thr, pc, fd); return fd; diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt index dca02a65e9713..b5342f2e9e645 100644 --- a/compiler-rt/lib/ubsan/CMakeLists.txt +++ b/compiler-rt/lib/ubsan/CMakeLists.txt @@ -200,7 +200,7 @@ else() CFLAGS ${UBSAN_CXXFLAGS} PARENT_TARGET ubsan) - if (FUCHSIA OR UNIX) + if (COMPILER_RT_HAS_VERSION_SCRIPT) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") add_compiler_rt_object_libraries(RTUbsan_dynamic_version_script_dummy ARCHS ${UBSAN_SUPPORTED_ARCH} @@ -216,7 +216,7 @@ else() -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.ubsan_standalone-dynamic-${arch}.vers) # The Solaris 11.4 linker supports a subset of GNU ld version scripts, # but requires a special option to enable it. - if (OS_NAME MATCHES "SunOS") + if (COMPILER_RT_HAS_GNU_VERSION_SCRIPT_COMPAT) list(APPEND VERSION_SCRIPT_FLAG -Wl,-z,gnu-version-script-compat) endif() set_property(SOURCE diff --git a/compiler-rt/test/asan/TestCases/Linux/activation-options.cpp b/compiler-rt/test/asan/TestCases/Linux/activation-options.cpp index 0074a6699d39e..59c4d5c46e771 100644 --- a/compiler-rt/test/asan/TestCases/Linux/activation-options.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/activation-options.cpp @@ -1,6 +1,8 @@ // Test for ASAN_OPTIONS=start_deactivated=1 mode. // Main executable is uninstrumented, but linked to ASan runtime. The shared // library is instrumented. +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46862 +// XFAIL: !compiler-rt-optimized // RUN: %clangxx_asan -O0 -DSHARED_LIB %s -fPIC -shared -o %t-so.so // RUN: %clangxx -O0 %s -c -o %t.o diff --git a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp index c450ae5bb8e91..c294ca53b6be5 100644 --- a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp @@ -32,7 +32,7 @@ // AArch64 bots fail on this test. // TODO(alekseys): Android lit do not run ulimit on device. 
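On the tsan interceptor change above: open(2) is genuinely variadic, and the interceptor now matches that signature, pulling the optional mode out with va_arg. A sketch of the forwarding pattern (a hypothetical wrapper, not the interceptor itself; the real interceptor reads the mode unconditionally, whereas this sketch only reads it when a mode argument must be present):

#include <cstdarg>
#include <cstdio>
#include <fcntl.h>
#include <sys/stat.h>

int my_open(const char *name, int oflag, ...) {
  mode_t mode = 0;
  if (oflag & O_CREAT) { // Only creation flags carry a mode argument.
    va_list ap;
    va_start(ap, oflag);
    mode = static_cast<mode_t>(va_arg(ap, int)); // mode_t promotes to int.
    va_end(ap);
  }
  printf("open(%s, %d, %o)\n", name, oflag, mode);
  return -1; // Sketch only; a real wrapper would call ::open here.
}

int main() {
  my_open("/tmp/a", O_RDONLY); // No mode argument at all.
  my_open("/tmp/b", O_CREAT | O_WRONLY, 0644);
  return 0;
}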
// REQUIRES: shadow-scale-3 -// UNSUPPORTED: s390,android,aarch64 +// UNSUPPORTED: s390,android,aarch64,powerpc64le #include #include diff --git a/compiler-rt/test/asan/TestCases/Linux/odr-violation.cpp b/compiler-rt/test/asan/TestCases/Linux/odr-violation.cpp index 9cdafca91b1f3..a14b59b67f114 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr-violation.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr-violation.cpp @@ -1,5 +1,7 @@ // FIXME: https://code.google.com/p/address-sanitizer/issues/detail?id=316 // XFAIL: android +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46862 +// XFAIL: !compiler-rt-optimized // // We use fast_unwind_on_malloc=0 to have full unwinding even w/o frame // pointers. This setting is not on by default because it's too expensive. diff --git a/compiler-rt/test/asan/TestCases/Linux/odr-vtable.cpp b/compiler-rt/test/asan/TestCases/Linux/odr-vtable.cpp index fdbab4bb1fd82..eb0805784df1a 100644 --- a/compiler-rt/test/asan/TestCases/Linux/odr-vtable.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/odr-vtable.cpp @@ -1,3 +1,6 @@ +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46862 +// XFAIL: !compiler-rt-optimized + // RUN: %clangxx_asan -fno-rtti -DBUILD_SO1 -fPIC -shared %s -o %dynamiclib1 // RUN: %clangxx_asan -fno-rtti -DBUILD_SO2 -fPIC -shared %s -o %dynamiclib2 // RUN: %clangxx_asan -fno-rtti %s %ld_flags_rpath_exe1 %ld_flags_rpath_exe2 -o %t diff --git a/compiler-rt/test/asan/TestCases/Posix/start-deactivated.cpp b/compiler-rt/test/asan/TestCases/Posix/start-deactivated.cpp index 9c674ac57e46a..c4f5f8c74a024 100644 --- a/compiler-rt/test/asan/TestCases/Posix/start-deactivated.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/start-deactivated.cpp @@ -1,6 +1,8 @@ // Test for ASAN_OPTIONS=start_deactivated=1 mode. // Main executable is uninstrumented, but linked to ASan runtime. The shared // library is instrumented. Memory errors before dlopen are not detected. +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46862 +// XFAIL: !compiler-rt-optimized // RUN: %clangxx_asan -O0 -DSHARED_LIB %s -std=c++11 -fPIC -shared -o %t-so.so // RUN: %clangxx -O0 %s -std=c++11 -c -o %t.o diff --git a/compiler-rt/test/asan/TestCases/handle_noreturn_bug.cpp b/compiler-rt/test/asan/TestCases/handle_noreturn_bug.cpp index 8c3c66a423d50..1639ad7d0859b 100644 --- a/compiler-rt/test/asan/TestCases/handle_noreturn_bug.cpp +++ b/compiler-rt/test/asan/TestCases/handle_noreturn_bug.cpp @@ -1,4 +1,6 @@ // Regression test: __asan_handle_no_return should unpoison stack even with poison_heap=0. +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46862 +// XFAIL: !compiler-rt-optimized // RUN: %clangxx_asan -O0 %s -o %t && \ // RUN: %env_asan_opts=poison_heap=1 %run %t && \ // RUN: %env_asan_opts=poison_heap=0 %run %t diff --git a/compiler-rt/test/asan/TestCases/strstr_strict.c b/compiler-rt/test/asan/TestCases/strstr_strict.c index 7d2c4ec2f3258..0d9482723f5f6 100644 --- a/compiler-rt/test/asan/TestCases/strstr_strict.c +++ b/compiler-rt/test/asan/TestCases/strstr_strict.c @@ -3,8 +3,8 @@ // Newer versions of Android's strstr() uses memchr() internally, which actually // does trigger a heap-buffer-overflow (as it tries to find the -// null-terminator). -// UNSUPPORTED: android +// null-terminator). The same applies to FreeBSD. 
+// UNSUPPORTED: android, freebsd // RUN: %env_asan_opts=strict_string_checks=false %run %t 2>&1 // RUN: %env_asan_opts=strict_string_checks=true not %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/dfsan/fast16labels.c b/compiler-rt/test/dfsan/fast16labels.c index 269d1e2efc0af..72aaf38bfed42 100644 --- a/compiler-rt/test/dfsan/fast16labels.c +++ b/compiler-rt/test/dfsan/fast16labels.c @@ -1,14 +1,8 @@ -// RUN: %clang_dfsan %s -o %t -// RUN: DFSAN_OPTIONS=fast16labels=1 %run %t -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_create_label 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CREATE-LABEL -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_get_label_info 2>&1 \ -// RUN: | FileCheck %s --check-prefix=GET-LABEL-INFO -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_has_label_with_desc \ -// RUN: 2>&1 | FileCheck %s --check-prefix=HAS-LABEL-WITH-DESC -// -// Tests DFSAN_OPTIONS=fast16labels=1 +// RUN: %clang_dfsan %s -mllvm -dfsan-fast-16-labels -o %t +// RUN: %run %t // +// Tests fast16labels mode. + #include #include @@ -20,19 +14,6 @@ int foo(int a, int b) { } int main(int argc, char *argv[]) { - // Death tests for unsupported API usage. - const char *command = (argc < 2) ? "" : argv[1]; - // CREATE-LABEL: FATAL: DataFlowSanitizer: dfsan_create_label is unsupported - if (strcmp(command, "dfsan_create_label") == 0) - dfsan_create_label("", NULL); - // GET-LABEL-INFO: FATAL: DataFlowSanitizer: dfsan_get_label_info is unsupported - if (strcmp(command, "dfsan_get_label_info") == 0) - dfsan_get_label_info(1); - // HAS-LABEL-WITH-DESC: FATAL: DataFlowSanitizer: dfsan_has_label_with_desc is unsupported - if (strcmp(command, "dfsan_has_label_with_desc") == 0) - dfsan_has_label_with_desc(1, ""); - - // Supported usage. int a = 10; int b = 20; dfsan_set_label(8, &a, sizeof(a)); @@ -43,7 +24,4 @@ int main(int argc, char *argv[]) { dfsan_label l = dfsan_get_label(c); printf("C: 0x%x\n", l); assert(l == 520); // OR of the other two labels. - assert(dfsan_has_label(l, 8)); - assert(dfsan_has_label(l, 512)); - assert(!dfsan_has_label(l, 1)); } diff --git a/compiler-rt/test/fuzzer/bcmp.test b/compiler-rt/test/fuzzer/bcmp.test index 37ee6bedd4ee1..8c1e532b16aba 100644 --- a/compiler-rt/test/fuzzer/bcmp.test +++ b/compiler-rt/test/fuzzer/bcmp.test @@ -1,4 +1,4 @@ -UNSUPPORTED: freebsd +UNSUPPORTED: darwin, freebsd, windows RUN: %cpp_compiler -DMEMCMP=bcmp %S/MemcmpTest.cpp -o %t RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/dataflow.test b/compiler-rt/test/fuzzer/dataflow.test index 93cd580b9d52e..391f160d0044a 100644 --- a/compiler-rt/test/fuzzer/dataflow.test +++ b/compiler-rt/test/fuzzer/dataflow.test @@ -2,10 +2,10 @@ REQUIRES: linux, x86_64 # Build the tracer and the test. 
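The assert(l == 520) kept in the rewritten dfsan test above is the core of the fast-16-labels mode: a label is a 16-bit set, and combining two tainted values simply ORs their label bits (8 | 512 == 520), with no label table or dfsan_create_label bookkeeping. That propagation modeled in plain C++ (no DFSan required):

#include <cassert>
#include <cstdint>

using dfsan_label = uint16_t; // Fast mode: each bit is one label.

// Union of taint across a binary operation; this is all the fast mode
// needs, which is why the table-based APIs could be dropped.
dfsan_label unionLabels(dfsan_label A, dfsan_label B) { return A | B; }

int main() {
  const dfsan_label LabelA = 8;   // Bit 3, as set on `a` in the test.
  const dfsan_label LabelB = 512; // Bit 9, as set on `b`.
  const dfsan_label LabelC = unionLabels(LabelA, LabelB);
  assert(LabelC == 520);
  // Membership is a bit test: the moral equivalent of dfsan_has_label.
  assert((LabelC & LabelA) != 0 && (LabelC & LabelB) != 0);
  return 0;
}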
-RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o +RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fPIC %S/../../lib/fuzzer/dataflow/DataFlowCallbacks.cpp -o %t-DataFlowCallbacks.o -RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/ThreeFunctionsTest.cpp %t-DataFlow*.o -o %t-ThreeFunctionsTestDF -RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/Labels20Test.cpp %t-DataFlow*.o -o %t-Labels20TestDF +RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/ThreeFunctionsTest.cpp %t-DataFlow*.o -o %t-ThreeFunctionsTestDF +RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/Labels20Test.cpp %t-DataFlow*.o -o %t-Labels20TestDF RUN: %cpp_compiler %S/ThreeFunctionsTest.cpp -o %t-ThreeFunctionsTest # Dump the function list. @@ -30,7 +30,7 @@ RUN: echo -n FUZxxxxxxxxxxxxxxxxx > %t/IN20/FUZxxxxxxxxxxxxxxxxx RUN: echo -n FUxxxxxxxxxxxxxxxxxx > %t/IN20/FUxxxxxxxxxxxxxxxxxx -RUN: export DFSAN_OPTIONS=fast16labels=1:warn_unimplemented=0 +RUN: export DFSAN_OPTIONS=warn_unimplemented=0 # This test assumes that the functions in ThreeFunctionsTestDF are instrumented # in a specific order: diff --git a/compiler-rt/test/fuzzer/memcmp64.test b/compiler-rt/test/fuzzer/memcmp64.test index feb81d1e4fb28..24d14bf73bbf4 100644 --- a/compiler-rt/test/fuzzer/memcmp64.test +++ b/compiler-rt/test/fuzzer/memcmp64.test @@ -1,8 +1,4 @@ UNSUPPORTED: freebsd RUN: %cpp_compiler %S/Memcmp64BytesTest.cpp -o %t-Memcmp64BytesTest RUN: not %run %t-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s - -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest -RUN: not %run %t-NoAsanMemcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s - CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-bcmp.test b/compiler-rt/test/fuzzer/noasan-bcmp.test new file mode 100644 index 0000000000000..a3dd17bf6e2b8 --- /dev/null +++ b/compiler-rt/test/fuzzer/noasan-bcmp.test @@ -0,0 +1,4 @@ +UNSUPPORTED: darwin, freebsd, windows +RUN: %cpp_compiler -fno-sanitize=address -DMEMCMP=bcmp %S/MemcmpTest.cpp -o %t +RUN: not %run %t -seed=1 -runs=10000000 2>&1 | FileCheck %s +CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-memcmp.test b/compiler-rt/test/fuzzer/noasan-memcmp.test index c90755c53a900..c5ce2fff8c9fa 100644 --- a/compiler-rt/test/fuzzer/noasan-memcmp.test +++ b/compiler-rt/test/fuzzer/noasan-memcmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-memcmp %S/MemcmpTest.cpp -o %t-NoAsanMemcmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/MemcmpTest.cpp -o %t-NoAsanMemcmpTest RUN: not %run %t-NoAsanMemcmpTest -seed=1 -runs=10000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-memcmp %S/CustomAllocator.cpp %S/MemcmpTest.cpp -o %t-NoAsanCustomAllocatorMemcmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/MemcmpTest.cpp 
-o %t-NoAsanCustomAllocatorMemcmpTest RUN: not %run %t-NoAsanCustomAllocatorMemcmpTest -seed=1 -runs=10000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-memcmp64.test b/compiler-rt/test/fuzzer/noasan-memcmp64.test new file mode 100644 index 0000000000000..496ee386193e9 --- /dev/null +++ b/compiler-rt/test/fuzzer/noasan-memcmp64.test @@ -0,0 +1,6 @@ +UNSUPPORTED: darwin, freebsd, windows + +RUN: %cpp_compiler -fno-sanitize=address %S/Memcmp64BytesTest.cpp -o %t-NoAsanMemcmp64BytesTest +RUN: not %run %t-NoAsanMemcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s + +CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strcmp.test b/compiler-rt/test/fuzzer/noasan-strcmp.test index 76b7c5de7c7bb..c264dec6bea2f 100644 --- a/compiler-rt/test/fuzzer/noasan-strcmp.test +++ b/compiler-rt/test/fuzzer/noasan-strcmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strcmp %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrcmpTest.cpp -o %t-NoAsanStrcmpTest RUN: not %run %t-NoAsanStrcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strcmp %S/CustomAllocator.cpp %S/StrcmpTest.cpp -o %t-NoAsanCustomAllocatorStrcmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrcmpTest.cpp -o %t-NoAsanCustomAllocatorStrcmpTest RUN: not %run %t-NoAsanCustomAllocatorStrcmpTest -seed=1 -runs=2000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strncmp.test b/compiler-rt/test/fuzzer/noasan-strncmp.test index 705781ec39586..dd0f254609221 100644 --- a/compiler-rt/test/fuzzer/noasan-strncmp.test +++ b/compiler-rt/test/fuzzer/noasan-strncmp.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strncmp %S/StrncmpTest.cpp -o %t-NoAsanStrncmpTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrncmpTest.cpp -o %t-NoAsanStrncmpTest RUN: not %run %t-NoAsanStrncmpTest -seed=2 -runs=10000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strncmp %S/CustomAllocator.cpp %S/StrncmpTest.cpp -o %t-NoAsanCustomAllocatorStrncmpTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrncmpTest.cpp -o %t-NoAsanCustomAllocatorStrncmpTest RUN: not %run %t-NoAsanCustomAllocatorStrncmpTest -seed=2 -runs=10000000 2>&1 | FileCheck %s CHECK: BINGO diff --git a/compiler-rt/test/fuzzer/noasan-strstr.test b/compiler-rt/test/fuzzer/noasan-strstr.test index f06e903149bd9..e969170bfac98 100644 --- a/compiler-rt/test/fuzzer/noasan-strstr.test +++ b/compiler-rt/test/fuzzer/noasan-strstr.test @@ -1,9 +1,9 @@ UNSUPPORTED: darwin, freebsd, windows -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-strstr %S/StrstrTest.cpp -o %t-NoAsanStrstrTest +RUN: %cpp_compiler -fno-sanitize=address %S/StrstrTest.cpp -o %t-NoAsanStrstrTest RUN: not %run %t-NoAsanStrstrTest -seed=1 -runs=2000000 2>&1 | FileCheck %s -RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc -fno-builtin-strstr %S/CustomAllocator.cpp %S/StrstrTest.cpp -o %t-NoAsanCustomAllocatorStrstrTest +RUN: %cpp_compiler -fno-sanitize=address -fno-builtin-calloc %S/CustomAllocator.cpp %S/StrstrTest.cpp -o %t-NoAsanCustomAllocatorStrstrTest RUN: not %run %t-NoAsanCustomAllocatorStrstrTest -seed=1 -runs=2000000 2>&1 | FileCheck %s CHECK: BINGO diff --git 
a/compiler-rt/test/fuzzer/only-some-bytes-fork.test b/compiler-rt/test/fuzzer/only-some-bytes-fork.test index ef03c24fd2c63..a4142dbbb3b9a 100644 --- a/compiler-rt/test/fuzzer/only-some-bytes-fork.test +++ b/compiler-rt/test/fuzzer/only-some-bytes-fork.test @@ -2,9 +2,9 @@ REQUIRES: linux, x86_64 # Build the tracer and the test. -RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o +RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fPIC %S/../../lib/fuzzer/dataflow/DataFlowCallbacks.cpp -o %t-DataFlowCallbacks.o -RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/OnlySomeBytesTest.cpp %t-DataFlow*.o -o %t-DFT +RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/OnlySomeBytesTest.cpp %t-DataFlow*.o -o %t-DFT RUN: %cpp_compiler %S/OnlySomeBytesTest.cpp -o %t-Fuzz # Test that the fork mode can collect and use the DFT diff --git a/compiler-rt/test/fuzzer/only-some-bytes.test b/compiler-rt/test/fuzzer/only-some-bytes.test index 62703f398f76b..9a31ebfc93c1e 100644 --- a/compiler-rt/test/fuzzer/only-some-bytes.test +++ b/compiler-rt/test/fuzzer/only-some-bytes.test @@ -2,9 +2,9 @@ REQUIRES: linux, x86_64 # Build the tracer and the test. -RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o +RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels %S/../../lib/fuzzer/dataflow/DataFlow.cpp -o %t-DataFlow.o RUN: %no_fuzzer_cpp_compiler -c -fno-sanitize=all -fPIC %S/../../lib/fuzzer/dataflow/DataFlowCallbacks.cpp -o %t-DataFlowCallbacks.o -RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/OnlySomeBytesTest.cpp %t-DataFlow*.o -o %t-DFT +RUN: %no_fuzzer_cpp_compiler -fno-sanitize=all -fsanitize=dataflow -mllvm -dfsan-fast-16-labels -fsanitize-coverage=trace-pc-guard,pc-table,bb,trace-cmp %S/OnlySomeBytesTest.cpp %t-DataFlow*.o -o %t-DFT RUN: %cpp_compiler %S/OnlySomeBytesTest.cpp -o %t-Fuzz # Prepare the inputs. diff --git a/compiler-rt/test/hwasan/TestCases/exported-tagged-global.c b/compiler-rt/test/hwasan/TestCases/exported-tagged-global.c new file mode 100644 index 0000000000000..198d8781f68af --- /dev/null +++ b/compiler-rt/test/hwasan/TestCases/exported-tagged-global.c @@ -0,0 +1,16 @@ +// RUN: %clang_hwasan %s -o %t +// RUN: %run %t +// RUN: %clang_hwasan -O1 %s -o %t +// RUN: %run %t +// RUN: %clang_hwasan -O1 -mllvm --aarch64-enable-global-isel-at-O=1 %s -o %t +// RUN: %run %t + +static int global; + +__attribute__((optnone)) int *address_of_global() { return &global; } + +int main(int argc, char **argv) { + int *global_address = address_of_global(); + *global_address = 13; + return 0; +} diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index fdc28a4637ff7..d773f3766fd33 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -23,6 +23,9 @@ # bash on Windows is usually very slow. 
execute_external = (not sys.platform in ['win32']) +# Allow expanding substitutions that are based on other substitutions +config.recursiveExpansionLimit = 10 + # Setup test format. config.test_format = lit.formats.ShTest(execute_external) if execute_external: diff --git a/compiler-rt/test/msan/libatomic.c b/compiler-rt/test/msan/libatomic.c index 5d8aa9ab6a60f..a8c030b7dbb26 100644 --- a/compiler-rt/test/msan/libatomic.c +++ b/compiler-rt/test/msan/libatomic.c @@ -1,6 +1,6 @@ -// RUN: %clangxx_msan -fsanitize-memory-track-origins=2 -latomic -DTEST_STORE -O0 %s -o %t && %run %t 2>&1 -// RUN: %clangxx_msan -fsanitize-memory-track-origins=0 -latomic -DTEST_LOAD -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_msan -fsanitize-memory-track-origins=2 -latomic -DTEST_LOAD -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SHADOW +// RUN: %clang_msan -fsanitize-memory-track-origins=2 -latomic -DTEST_STORE -O0 %s -o %t && %run %t 2>&1 +// RUN: %clang_msan -fsanitize-memory-track-origins=0 -latomic -DTEST_LOAD -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clang_msan -fsanitize-memory-track-origins=2 -latomic -DTEST_LOAD -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SHADOW // PPC has no libatomic // UNSUPPORTED: powerpc64-target-arch diff --git a/compiler-rt/test/profile/Inputs/instrprof-comdat.h b/compiler-rt/test/profile/Inputs/instrprof-comdat.h index 61e283cc878ed..956496ee1361a 100644 --- a/compiler-rt/test/profile/Inputs/instrprof-comdat.h +++ b/compiler-rt/test/profile/Inputs/instrprof-comdat.h @@ -18,6 +18,6 @@ template T FOO::DoIt(T ti) { // HEADER: [[@LINE]]| 2|template if (I > ti / 2) // HEADER: [[@LINE]]| 20| if (I > ti t -= 1; // HEADER: [[@LINE]]| 8| t -= 1; } // HEADER: [[@LINE]]| 10| } - // HEADER: [[@LINE]]| 1| + // HEADER: [[@LINE]]| | return t; // HEADER: [[@LINE]]| 1| return t; } diff --git a/compiler-rt/test/profile/Linux/instrprof-value-merge.c b/compiler-rt/test/profile/Linux/instrprof-value-merge.c index 902430a4a9686..2619a1d00336e 100644 --- a/compiler-rt/test/profile/Linux/instrprof-value-merge.c +++ b/compiler-rt/test/profile/Linux/instrprof-value-merge.c @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) { // CHECK: Counters: // CHECK: main: -// CHECK: Hash: 0x00030012a7ab6e87 +// CHECK: Hash: 0x0a9bd81e87ab6e87 // CHECK: Counters: 6 // CHECK: Indirect Call Site Count: 3 // CHECK: Number of Memory Intrinsics Calls: 3 diff --git a/compiler-rt/test/profile/coverage_comments.cpp b/compiler-rt/test/profile/coverage_comments.cpp new file mode 100644 index 0000000000000..0cf78add7913b --- /dev/null +++ b/compiler-rt/test/profile/coverage_comments.cpp @@ -0,0 +1,71 @@ +// RUN: %clangxx_profgen -fcoverage-mapping -Wno-comment -o %t %s +// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t +// RUN: llvm-profdata merge -o %t.profdata %t.profraw +// RUN: llvm-cov show %t -instr-profile %t.profdata -path-equivalence=/tmp,%S 2>&1 | FileCheck %s + +int main() { // CHECK: [[# @LINE]]| 1|int main() { + /* comment */ int x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + int y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| 1| + int z = 0; // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // 
CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + z = // CHECK-NEXT: [[# @LINE]]| 1| + x // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + + /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */y; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + // Comments inside directives. // CHECK-NEXT: [[# @LINE]]| | + #if 0 //comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ x = 0; // CHECK-NEXT: [[# @LINE]]| | + y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| | + z = 0; // comment // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + #endif // comment // CHECK-NEXT: [[# @LINE]]| | + #if 1 // comment // CHECK-NEXT: [[# @LINE]]| 1| + /* comment */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + y = 0; /* comment */ // CHECK-NEXT: [[# @LINE]]| 1| + z = 0; // comment // CHECK-NEXT: [[# @LINE]]| 1| + // comment // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + x = 0; /* // CHECK-NEXT: [[# @LINE]]| 1| + comment // CHECK-NEXT: [[# @LINE]]| | + */ // CHECK-NEXT: [[# @LINE]]| | + // CHECK-NEXT: [[# @LINE]]| | + /* // CHECK-NEXT: [[# @LINE]]| | + comment // CHECK-NEXT: [[# @LINE]]| | + */ x = 0; // CHECK-NEXT: [[# @LINE]]| 1| + // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + // comment // CHECK-NEXT: [[# @LINE]]| | + /* comment */ // CHECK-NEXT: [[# @LINE]]| | + #endif //comment // CHECK-NEXT: [[# @LINE]]| 1| + return 0; // CHECK-NEXT: [[# @LINE]]| 1| +} // CHECK-NEXT: [[# @LINE]]| 1| diff --git a/compiler-rt/test/profile/instrprof-set-file-object-merging.c b/compiler-rt/test/profile/instrprof-set-file-object-merging.c index 0ca5f6ff9ed95..35e9becf228f0 100644 --- a/compiler-rt/test/profile/instrprof-set-file-object-merging.c +++ b/compiler-rt/test/profile/instrprof-set-file-object-merging.c @@ -34,7 +34,7 @@ int main(int argc, const char *argv[]) { // CHECK: 17| 2| // CHECK: 18| 2| FILE *F = fopen(argv[1], "r+b"); // CHECK: 19| 2| if (!F) { -// CHECK: 20| 1| // File might not exist, try opening with truncation +// CHECK: 20| | // File might not exist, try opening with truncation // CHECK: 21| 1| F = fopen(argv[1], "w+b"); // CHECK: 22| 1| } // CHECK: 23| 2| __llvm_profile_set_file_object(F, 1); diff --git a/compiler-rt/test/profile/instrprof-without-libc.c b/compiler-rt/test/profile/instrprof-without-libc.c index 6e9c1dde01e6a..cd9fb5e1dd8e0 100644 --- a/compiler-rt/test/profile/instrprof-without-libc.c +++ b/compiler-rt/test/profile/instrprof-without-libc.c @@ -72,3 +72,4 @@ int main(int argc, const char *argv[]) { // CHECK-SYMBOLS-NOT: {{ }}_free // CHECK-SYMBOLS-NOT: {{ }}free // CHECK-SYMBOLS-NOT: {{ }}_open +// CHECK-SYMBOLS-NOT: {{ }}_getpagesize diff --git 
a/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp index bd58f4bd92654..03ddf06ae2d41 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/allow_user_segv.cpp @@ -1,5 +1,7 @@ // Regression test for // https://code.google.com/p/address-sanitizer/issues/detail?id=180 +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx -O0 %s -o %t diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp index 1c9ea982b95b2..208ece3e05af4 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp @@ -1,4 +1,6 @@ // Test line numbers in signal handlers +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx %s -o %t -O0 // RUN: %env_tool_opts=handle_segv=1:print_stacktrace=1 not %run %t 1 2>&1 | FileCheck --check-prefixes=CHECK1,CHECK %s diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/xdrrec.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/xdrrec.cpp index f39bed3da3763..cc50a650f751b 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/xdrrec.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/xdrrec.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx -O0 %s -o %t && %run %t | FileCheck %s -// REQUIRES: !android +// REQUIRES: sunrpc, !android #include #include diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp index 94c50be169b48..1630003e519bf 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/dedup_token_length_test.cpp @@ -1,4 +1,6 @@ // Test dedup_token_length +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx -O0 %s -o %t // RUN: env %tool_options='abort_on_error=0' not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK0 --match-full-lines // RUN: env %tool_options='abort_on_error=0, dedup_token_length=0' not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK0 --match-full-lines diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_read_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_read_test.cpp index 9615d7132da5e..2addb9bd47508 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_read_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_read_test.cpp @@ -1,4 +1,6 @@ // Test that there was an illegal READ memory access. 
+// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s // REQUIRES: stable-runtime diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_write_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_write_test.cpp index 13d1c6a06905a..531b4d294b8fb 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_write_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/illegal_write_test.cpp @@ -1,4 +1,6 @@ // Test that there was an illegal WRITE memory access. +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s // REQUIRES: stable-runtime diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp index cc7de193f0a41..6ba7025bf7578 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_fd_test.cpp @@ -1,4 +1,6 @@ // Test __sanitizer_set_report_fd: +// Fails with debug checks: https://bugs.llvm.org/show_bug.cgi?id=46860 +// XFAIL: !compiler-rt-optimized && tsan // RUN: %clangxx -O2 %s -o %t // RUN: not %run %t 2>&1 | FileCheck %s // RUN: not %run %t stdout | FileCheck %s diff --git a/compiler-rt/test/tsan/Darwin/variadic-open.cpp b/compiler-rt/test/tsan/Darwin/variadic-open.cpp new file mode 100644 index 0000000000000..cd7684005ceca --- /dev/null +++ b/compiler-rt/test/tsan/Darwin/variadic-open.cpp @@ -0,0 +1,24 @@ +// RUN: %clangxx_tsan -O1 %s -o %t && %run %t %t.tmp 2>&1 | FileCheck %s +#include <assert.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#include <unistd.h> + +int main(int argc, char *argv[]) { + fprintf(stderr, "Hello world.\n"); + assert(argv[1]); + unlink(argv[1]); + int fd = open(argv[1], O_RDWR | O_CREAT, 0600); + assert(fd != -1); + struct stat info; + int result = fstat(fd, &info); + fprintf(stderr, "permissions = 0%o\n", info.st_mode & ~S_IFMT); + assert(result == 0); + close(fd); + fprintf(stderr, "Done.\n"); +} + +// CHECK: Hello world. +// CHECK: permissions = 0600 +// CHECK: Done.
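The variadic-open.cpp test above pins down one behavior: the permission bits passed as the variadic third argument of open(2) must survive interception, hence the CHECK for permissions = 0600. A minimal sketch of how an interceptor can forward that argument; this is an illustration under assumed names (my_open is hypothetical), not TSan's actual interceptor code:

#include <cstdarg>
#include <cstdio>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

// Hypothetical interceptor: the mode argument exists only when O_CREAT
// is set, so it must be fetched with va_arg before forwarding.
extern "C" int my_open(const char *path, int flags, ...) {
  mode_t mode = 0;
  if (flags & O_CREAT) {
    va_list ap;
    va_start(ap, flags);
    mode = static_cast<mode_t>(va_arg(ap, int)); // mode is passed as int here
    va_end(ap);
  }
  // A real tool would record the file operation here.
  return open(path, flags, mode); // forward to the real open(2)
}

int main() {
  int fd = my_open("/tmp/variadic-open-demo", O_RDWR | O_CREAT, 0600);
  if (fd != -1) {
    struct stat info;
    if (fstat(fd, &info) == 0)
      std::printf("permissions = 0%o\n", info.st_mode & ~S_IFMT);
    close(fd);
    unlink("/tmp/variadic-open-demo");
  }
  return 0;
}

Skipping the va_arg step and forwarding only (path, flags) is exactly the kind of bug this test would catch: the file would then be created with garbage permission bits.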
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/nonnull.cpp b/compiler-rt/test/ubsan/TestCases/Misc/nonnull.cpp index d5cd2bf763b7c..c612cd8247156 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/nonnull.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/nonnull.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx -fsanitize=returns-nonnull-attribute -w %s -O3 -o %t -// RUN: %run %t foo 2>&1 | count 0 +// RUN: %run %t foo 2>&1 | FileCheck %s --check-prefix=NOERROR --allow-empty --implicit-check-not='runtime error' // RUN: %run %t 2>&1 | FileCheck %s // RUN: %clangxx -fsanitize=returns-nonnull-attribute -fno-sanitize-recover=returns-nonnull-attribute -w %s -O3 -o %t.abort // RUN: not %run %t.abort &> /dev/null @@ -40,3 +40,5 @@ int main(int argc, char **argv) { return 0; } + +// NOERROR-NOT: runtime error diff --git a/compiler-rt/test/ubsan/TestCases/Misc/nullability.c b/compiler-rt/test/ubsan/TestCases/Misc/nullability.c index a68d0f8ea3628..42837983865d8 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/nullability.c +++ b/compiler-rt/test/ubsan/TestCases/Misc/nullability.c @@ -1,6 +1,6 @@ // UNSUPPORTED: android // RUN: %clang -w -fsanitize=nullability-arg,nullability-assign,nullability-return %s -O3 -o %t -// RUN: %run %t foo 2>&1 | count 0 +// RUN: %run %t foo 2>&1 | FileCheck %s --check-prefix=NOERROR --allow-empty --implicit-check-not='runtime error' // RUN: %run %t 2>&1 | FileCheck %s // RUN: echo "nullability-arg:nullability.c" > %t.supp @@ -67,3 +67,5 @@ int main(int argc, char **argv) { nonnull_init2(p); return 0; } + +// NOERROR-NOT: runtime error diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 9dd6281d410bf..f1aaa5c6473fe 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -380,7 +380,6 @@ endif() add_subdirectory(runtime) if (FLANG_INCLUDE_TESTS) - enable_testing() add_subdirectory(test) if (FLANG_GTEST_AVAIL) add_subdirectory(unittests) diff --git a/flang/include/flang/Decimal/decimal.h b/flang/include/flang/Decimal/decimal.h index fa687e92d35b4..214b1d15d44b3 100644 --- a/flang/include/flang/Decimal/decimal.h +++ b/flang/include/flang/Decimal/decimal.h @@ -44,12 +44,11 @@ struct ConversionToDecimalResult { }; enum FortranRounding { - RoundNearest, /* RN */ + RoundNearest, /* RN and RP */ RoundUp, /* RU */ RoundDown, /* RD */ RoundToZero, /* RZ - no rounding */ RoundCompatible, /* RC: like RN, but ties go away from 0 */ - RoundDefault, /* RP: maps to one of the above */ }; /* The "minimize" flag causes the fewest number of output digits diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h index 1617bdd086dcb..a25916f94ef79 100644 --- a/flang/include/flang/Evaluate/constant.h +++ b/flang/include/flang/Evaluate/constant.h @@ -54,7 +54,7 @@ std::size_t TotalElementCount(const ConstantSubscripts &); std::optional> ValidateDimensionOrder( int rank, const std::vector &order); -bool IsValidShape(const ConstantSubscripts &); +bool HasNegativeExtent(const ConstantSubscripts &); class ConstantBounds { public: diff --git a/flang/include/flang/Evaluate/intrinsics.h b/flang/include/flang/Evaluate/intrinsics.h index 88d6a7af13eb7..09f5691b1ea76 100644 --- a/flang/include/flang/Evaluate/intrinsics.h +++ b/flang/include/flang/Evaluate/intrinsics.h @@ -76,6 +76,11 @@ class IntrinsicProcTable { // Inquiry intrinsics are defined in section 16.7, table 16.1 IntrinsicClass GetIntrinsicClass(const std::string &) const; + // Return the generic name of a specific intrinsic name. 
+ // The name provided is returned if it is a generic intrinsic name or is + // not known to be an intrinsic. + std::string GetGenericIntrinsicName(const std::string &) const; + // Probe the intrinsics for a match against a specific call. // On success, the actual arguments are transferred to the result // in dummy argument order; on failure, the actual arguments remain @@ -87,6 +92,11 @@ class IntrinsicProcTable { std::optional IsSpecificIntrinsicFunction( const std::string &) const; + // Illegal name for an intrinsic used to avoid cascading error messages when + // constant folding. + static const inline std::string InvalidName{ + "(invalid intrinsic function call)"}; + llvm::raw_ostream &Dump(llvm::raw_ostream &) const; private: diff --git a/flang/include/flang/Optimizer/CodeGen/CMakeLists.txt b/flang/include/flang/Optimizer/CodeGen/CMakeLists.txt index ab6526ee18330..9acf6f89e12f5 100644 --- a/flang/include/flang/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/include/flang/Optimizer/CodeGen/CMakeLists.txt @@ -1,6 +1,6 @@ set(LLVM_TARGET_DEFINITIONS CGPasses.td) -mlir_tablegen(CGPasses.h.inc -gen-pass-decls) +mlir_tablegen(CGPasses.h.inc -gen-pass-decls -name OptCodeGen) add_public_tablegen_target(FIROptCodeGenPassIncGen) add_mlir_doc(Passes -gen-pass-doc OptimizerCodeGenPasses ./) diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h index 9b968172f3486..a90d0a50dac64 100644 --- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h +++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h @@ -28,12 +28,9 @@ std::unique_ptr createFIRToLLVMPass(NameUniquer &uniquer); std::unique_ptr createLLVMDialectToLLVMPass(llvm::raw_ostream &output); -inline void registerOptCodeGenPasses() { - using mlir::Pass; // declarative passes #define GEN_PASS_REGISTRATION #include "flang/Optimizer/CodeGen/CGPasses.h.inc" -} } // namespace fir diff --git a/flang/include/flang/Optimizer/Transforms/CMakeLists.txt b/flang/include/flang/Optimizer/Transforms/CMakeLists.txt index fde17eb88622e..b928991e0a37e 100644 --- a/flang/include/flang/Optimizer/Transforms/CMakeLists.txt +++ b/flang/include/flang/Optimizer/Transforms/CMakeLists.txt @@ -1,6 +1,6 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) -mlir_tablegen(Passes.h.inc -gen-pass-decls) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name OptTransform) add_public_tablegen_target(FIROptTransformsPassIncGen) add_mlir_doc(Passes -gen-pass-doc OptimizerTransformPasses ./) diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 9377c2dc61cc3..5e71995736e6a 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -46,12 +46,9 @@ std::unique_ptr createMemToRegPass(); bool canLegallyInline(mlir::Operation *op, mlir::Region *reg, mlir::BlockAndValueMapping &map); -inline void registerOptTransformPasses() { -using mlir::Pass; // declarative passes #define GEN_PASS_REGISTRATION #include "flang/Optimizer/Transforms/Passes.h.inc" -} } // namespace fir diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h index a67a008889276..5ebe5f32eb677 100644 --- a/flang/include/flang/Semantics/scope.h +++ b/flang/include/flang/Semantics/scope.h @@ -87,6 +87,7 @@ class Scope { bool IsModule() const; // only module, not submodule bool IsSubmodule() const; bool IsDerivedType() const { return kind_ == Kind::DerivedType; } + bool IsStmtFunction() const; bool IsParameterizedDerivedType() 
const; Symbol *symbol() { return symbol_; } const Symbol *symbol() const { return symbol_; } diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 227f951aef5f0..c0a50364b63db 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -219,10 +219,7 @@ class ProcEntityDetails : public EntityDetails, public WithPassArg { const ProcInterface &interface() const { return interface_; } ProcInterface &interface() { return interface_; } - void set_interface(const ProcInterface &interface) { - CHECK(!IsInterfaceSet()); - interface_ = interface; - } + void set_interface(const ProcInterface &interface) { interface_ = interface; } bool IsInterfaceSet() { return interface_.symbol() != nullptr || interface_.type() != nullptr; } @@ -482,6 +479,7 @@ class Symbol { Error, // an error has been reported on this symbol Function, // symbol is a function Subroutine, // symbol is a subroutine + StmtFunction, // symbol is a statement function (Function is set too) Implicit, // symbol is implicitly typed ModFile, // symbol came from .mod file ParentComp, // symbol is the "parent component" of an extended type @@ -491,6 +489,13 @@ class Symbol { LocalityShared, // named in SHARED locality-spec InDataStmt, // initialized in a DATA statement + // OpenACC data-sharing attribute + AccPrivate, AccFirstPrivate, AccShared, + // OpenACC data-mapping attribute + AccCopyIn, AccCopyOut, AccCreate, AccDelete, AccPresent, + // OpenACC miscellaneous flags + AccCommonBlock, AccThreadPrivate, AccReduction, AccNone, AccPreDetermined, + // OpenMP data-sharing attribute OmpShared, OmpPrivate, OmpLinear, OmpFirstPrivate, OmpLastPrivate, // OpenMP data-mapping attribute diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h index 2fbb777104d6d..b0ee69ad5e426 100644 --- a/flang/lib/Decimal/big-radix-floating-point.h +++ b/flang/lib/Decimal/big-radix-floating-point.h @@ -66,12 +66,12 @@ template class BigRadixFloatingPointNumber { public: explicit BigRadixFloatingPointNumber( - enum FortranRounding rounding = RoundDefault) + enum FortranRounding rounding = RoundNearest) : rounding_{rounding} {} // Converts a binary floating point value. 
explicit BigRadixFloatingPointNumber( - Real, enum FortranRounding = RoundDefault); + Real, enum FortranRounding = RoundNearest); BigRadixFloatingPointNumber &SetToZero() { isNegative_ = false; @@ -355,7 +355,7 @@ template class BigRadixFloatingPointNumber { int digitLimit_{maxDigits}; // precision clamp int exponent_{0}; // signed power of ten bool isNegative_{false}; - enum FortranRounding rounding_ { RoundDefault }; + enum FortranRounding rounding_ { RoundNearest }; }; } // namespace Fortran::decimal #endif diff --git a/flang/lib/Decimal/binary-to-decimal.cpp b/flang/lib/Decimal/binary-to-decimal.cpp index bcc0f08558aa4..c89bffc8ccd4c 100644 --- a/flang/lib/Decimal/binary-to-decimal.cpp +++ b/flang/lib/Decimal/binary-to-decimal.cpp @@ -143,7 +143,6 @@ BigRadixFloatingPointNumber::ConvertToDecimal(char *buffer, bool incr{false}; switch (rounding_) { case RoundNearest: - case RoundDefault: incr = *end > '5' || (*end == '5' && (p > end + 1 || ((end[-1] - '0') & 1) != 0)); break; diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp index 502f0a003d641..5e927e93b3bbe 100644 --- a/flang/lib/Decimal/decimal-to-binary.cpp +++ b/flang/lib/Decimal/decimal-to-binary.cpp @@ -150,7 +150,6 @@ void BigRadixFloatingPointNumber radix / 2 || (LSD == radix / 2 && digit_[0] % 2 != 0); break; case RoundUp: @@ -260,7 +259,6 @@ ConversionToBinaryResult IntermediateFloat::ToBinary( bool incr{false}; switch (rounding) { case RoundNearest: - case RoundDefault: incr = guard > oneHalf || (guard == oneHalf && (fraction & 1)); break; case RoundUp: diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 9ac1a12e0f4ea..a7cc094033d05 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -43,7 +43,10 @@ class IsConstantExprHelper : public AllTraverse { } template bool operator()(const FunctionRef &call) const { if (const auto *intrinsic{std::get_if(&call.proc().u)}) { - return intrinsic->name == "kind"; + // kind is always a constant, and we avoid cascading errors by calling + // invalid calls to intrinsics constant + return intrinsic->name == "kind" || + intrinsic->name == IntrinsicProcTable::InvalidName; // TODO: other inquiry intrinsics } else { return false; diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp index e7497630f5969..5b73979f1e2c2 100644 --- a/flang/lib/Evaluate/constant.cpp +++ b/flang/lib/Evaluate/constant.cpp @@ -93,13 +93,13 @@ std::optional> ValidateDimensionOrder( } } -bool IsValidShape(const ConstantSubscripts &shape) { +bool HasNegativeExtent(const ConstantSubscripts &shape) { for (ConstantSubscript extent : shape) { if (extent < 0) { - return false; + return true; } } - return shape.size() <= common::maxRank; + return false; } template diff --git a/flang/lib/Evaluate/fold-complex.cpp b/flang/lib/Evaluate/fold-complex.cpp index ea96f045aa3e8..8058b2dbd13c4 100644 --- a/flang/lib/Evaluate/fold-complex.cpp +++ b/flang/lib/Evaluate/fold-complex.cpp @@ -36,7 +36,7 @@ Expr> FoldIntrinsicFunction( context, std::move(funcRef), &Scalar::CONJG); } else if (name == "cmplx") { using Part = typename T::Part; - if (args.size() == 1) { + if (args.size() == 2) { // CMPLX(X, [KIND]) if (auto *x{UnwrapExpr>(args[0])}) { return Fold(context, ConvertToType(std::move(*x))); } @@ -46,7 +46,8 @@ Expr> FoldIntrinsicFunction( Expr{ComplexConstructor{ToReal(context, std::move(re)), ToReal(context, std::move(im))}}); } - CHECK(args.size() == 2 || args.size() == 
3); + // CMPLX(X, [Y, KIND]) + CHECK(args.size() == 3); Expr re{std::move(*args[0].value().UnwrapExpr())}; Expr im{args[1] ? std::move(*args[1].value().UnwrapExpr()) : AsGenericExpr(Constant{Scalar{}})}; diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 85e35613d6404..ebe826fcbc126 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -22,6 +22,7 @@ #include "flang/Evaluate/expression.h" #include "flang/Evaluate/fold.h" #include "flang/Evaluate/formatting.h" +#include "flang/Evaluate/intrinsics.h" #include "flang/Evaluate/shape.h" #include "flang/Evaluate/tools.h" #include "flang/Evaluate/traverse.h" @@ -600,9 +601,9 @@ std::optional> GetIntegerVector(const B &x) { // gets re-folded. template Expr MakeInvalidIntrinsic(FunctionRef &&funcRef) { SpecificIntrinsic invalid{std::get(funcRef.proc().u)}; - invalid.name = "(invalid intrinsic function call)"; + invalid.name = IntrinsicProcTable::InvalidName; return Expr{FunctionRef{ProcedureDesignator{std::move(invalid)}, - ActualArguments{ActualArgument{AsGenericExpr(std::move(funcRef))}}}}; + ActualArguments{std::move(funcRef.arguments())}}}; } template Expr Folder::Reshape(FunctionRef &&funcRef) { @@ -615,8 +616,13 @@ template Expr Folder::Reshape(FunctionRef &&funcRef) { std::optional> order{GetIntegerVector(args[3])}; if (!source || !shape || (args[2] && !pad) || (args[3] && !order)) { return Expr{std::move(funcRef)}; // Non-constant arguments - } else if (!IsValidShape(shape.value())) { - context_.messages().Say("Invalid SHAPE in RESHAPE"_en_US); + } else if (shape.value().size() > common::maxRank) { + context_.messages().Say( + "Size of 'shape=' argument must not be greater than %d"_err_en_US, + common::maxRank); + } else if (HasNegativeExtent(shape.value())) { + context_.messages().Say( + "'shape=' argument must not have a negative extent"_err_en_US); } else { int rank{GetRank(shape.value())}; std::size_t resultElements{TotalElementCount(shape.value())}; @@ -626,12 +632,13 @@ template Expr Folder::Reshape(FunctionRef &&funcRef) { } std::vector *dimOrderPtr{dimOrder ? &dimOrder.value() : nullptr}; if (order && !dimOrder) { - context_.messages().Say("Invalid ORDER in RESHAPE"_en_US); + context_.messages().Say("Invalid 'order=' argument in RESHAPE"_err_en_US); } else if (resultElements > source->size() && (!pad || pad->empty())) { - context_.messages().Say("Too few SOURCE elements in RESHAPE and PAD" - "is not present or has null size"_en_US); + context_.messages().Say( + "Too few elements in 'source=' argument and 'pad=' " + "argument is not present or has null size"_err_en_US); } else { - Constant result{!source->empty() + Constant result{!source->empty() || !pad ? source->Reshape(std::move(shape.value())) : pad->Reshape(std::move(shape.value()))}; ConstantSubscripts subscripts{result.lbounds()}; diff --git a/flang/lib/Evaluate/host.h b/flang/lib/Evaluate/host.h index 2fac0424f17cc..1fc2423f4f0c3 100644 --- a/flang/lib/Evaluate/host.h +++ b/flang/lib/Evaluate/host.h @@ -140,14 +140,14 @@ template <> struct HostTypeHelper> { // It should be defined when gcc/clang have a better support for it. 
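The host.h hunk that follows relies on a single pattern: a Fortran REAL kind maps to a host C++ type only when that type is IEC 60559 (IEEE 754), and otherwise falls back to a marker type so a mismatch surfaces at compile time. The template arguments in the hunk were lost in extraction, so here is a simplified, self-contained sketch of the pattern; it is keyed on a bare kind number, whereas the actual flang helper is specialized on Fortran::evaluate type tags:

#include <limits>
#include <type_traits>

struct UnsupportedType {}; // marker: no usable host equivalent

// Sketch only: pick the host type when it is IEC 60559 (IEEE 754),
// otherwise fall back to the marker type.
template <int KIND> struct HostRealTypeHelper {
  using Type = UnsupportedType;
};

template <> struct HostRealTypeHelper<4> { // IEEE 754 binary32
  using Type = std::conditional_t<std::numeric_limits<float>::is_iec559,
      float, UnsupportedType>;
};

template <> struct HostRealTypeHelper<8> { // IEEE 754 binary64
  using Type = std::conditional_t<std::numeric_limits<double>::is_iec559,
      double, UnsupportedType>;
};

On a typical x86-64 or AArch64 host both conditions hold, so HostRealTypeHelper<4>::Type is float and HostRealTypeHelper<8>::Type is double.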
template <> struct HostTypeHelper> { - // IEE 754 64bits + // IEEE 754 64bits using Type = std::conditional_t::is_iec559, float, UnsupportedType>; }; template <> struct HostTypeHelper> { - // IEE 754 64bits + // IEEE 754 64bits using Type = std::conditional_t::is_iec559, double, UnsupportedType>; @@ -162,7 +162,7 @@ template <> struct HostTypeHelper> { }; template <> struct HostTypeHelper> { - // IEE 754 128bits + // IEEE 754 128bits using Type = std::conditional_t::digits == 113 && std::numeric_limits::max_exponent == 16384, diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 6237499d12ef7..ca9a91c98dafe 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1129,9 +1129,10 @@ std::optional IntrinsicInterface::Match( d.rank == Rank::elementalOrBOZ) { continue; } else { + const IntrinsicDummyArgument &nextParam{dummy[j + 1]}; messages.Say( - "Typeless (BOZ) not allowed for '%s=' argument"_err_en_US, - d.keyword); + "Typeless (BOZ) not allowed for both '%s=' & '%s=' arguments"_err_en_US, // C7109 + d.keyword, nextParam.keyword); } } else { // NULL(), procedure, or procedure pointer @@ -1258,7 +1259,11 @@ std::optional IntrinsicInterface::Match( break; case Rank::shape: CHECK(!shapeArgSize); - if (rank == 1) { + if (rank != 1) { + messages.Say( + "'shape=' argument must be an array of rank 1"_err_en_US); + return std::nullopt; + } else { if (auto shape{GetShape(context, *arg)}) { if (auto constShape{AsConstantShape(context, *shape)}) { shapeArgSize = constShape->At(ConstantSubscripts{1}).ToInt64(); @@ -1568,6 +1573,7 @@ class IntrinsicProcTable::Implementation { bool IsIntrinsic(const std::string &) const; IntrinsicClass GetIntrinsicClass(const std::string &) const; + std::string GetGenericIntrinsicName(const std::string &) const; std::optional Probe(const CallCharacteristics &, ActualArguments &, FoldingContext &, const IntrinsicProcTable &) const; @@ -1625,6 +1631,17 @@ IntrinsicClass IntrinsicProcTable::Implementation::GetIntrinsicClass( return IntrinsicClass::noClass; } +std::string IntrinsicProcTable::Implementation::GetGenericIntrinsicName( + const std::string &name) const { + auto specificIntrinsic{specificFuncs_.find(name)}; + if (specificIntrinsic != specificFuncs_.end()) { + if (const char *genericName{specificIntrinsic->second->generic}) { + return {genericName}; + } + } + return name; +} + bool CheckAndRearrangeArguments(ActualArguments &arguments, parser::ContextualMessages &messages, const char *const dummyKeywords[], std::size_t trailingOptionals) { @@ -2075,6 +2092,11 @@ IntrinsicClass IntrinsicProcTable::GetIntrinsicClass( return DEREF(impl_).GetIntrinsicClass(name); } +std::string IntrinsicProcTable::GetGenericIntrinsicName( + const std::string &name) const { + return DEREF(impl_).GetGenericIntrinsicName(name); +} + std::optional IntrinsicProcTable::Probe( const CallCharacteristics &call, ActualArguments &arguments, FoldingContext &context) const { diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index bc7cd27df320e..017187069dcbb 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -75,6 +75,36 @@ genOMP(Fortran::lower::AbstractConverter &absConv, standaloneConstruct.u); } +static void +genOMP(Fortran::lower::AbstractConverter &absConv, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPBlockConstruct &blockConstruct) { + const auto &blockDirective = + std::get(blockConstruct.t); + const auto ¶llelDirective = + std::get(blockDirective.t); 
+ if (parallelDirective.v == llvm::omp::OMPD_parallel) { + auto &firOpBuilder = absConv.getFirOpBuilder(); + auto currentLocation = absConv.getCurrentLocation(); + auto insertPt = firOpBuilder.saveInsertionPoint(); + llvm::ArrayRef argTy; + mlir::ValueRange range; + llvm::SmallVector operandSegmentSizes(6 /*Size=*/, + 0 /*Value=*/); + // create and insert the operation. + auto parallelOp = firOpBuilder.create( + currentLocation, argTy, range); + parallelOp.setAttr(mlir::omp::ParallelOp::getOperandSegmentSizeAttr(), + firOpBuilder.getI32VectorAttr(operandSegmentSizes)); + parallelOp.getRegion().push_back(new Block{}); + auto &block = parallelOp.getRegion().back(); + firOpBuilder.setInsertionPointToStart(&block); + // ensure the block is well-formed. + firOpBuilder.create(currentLocation); + firOpBuilder.restoreInsertionPoint(insertPt); + } +} + void Fortran::lower::genOpenMPConstruct( Fortran::lower::AbstractConverter &absConv, Fortran::lower::pft::Evaluation &eval, @@ -92,7 +122,7 @@ void Fortran::lower::genOpenMPConstruct( TODO(); }, [&](const Fortran::parser::OpenMPBlockConstruct &blockConstruct) { - TODO(); + genOMP(absConv, eval, blockConstruct); }, [&](const Fortran::parser::OpenMPAtomicConstruct &atomicConstruct) { TODO(); diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp index a1f07967d9b08..9c10cedfd53bb 100644 --- a/flang/lib/Parser/preprocessor.cpp +++ b/flang/lib/Parser/preprocessor.cpp @@ -212,7 +212,7 @@ Preprocessor::Preprocessor(AllSources &allSources) : allSources_{allSources} { } void Preprocessor::Define(std::string macro, std::string value) { - definitions_.emplace(SaveTokenAsName(macro), Definition{value, allSources_}); + definitions_.emplace(macro, Definition{value, allSources_}); } void Preprocessor::Undefine(std::string macro) { definitions_.erase(macro); } diff --git a/flang/lib/Parser/preprocessor.h b/flang/lib/Parser/preprocessor.h index 977d480c2482e..b4374a9acf5c1 100644 --- a/flang/lib/Parser/preprocessor.h +++ b/flang/lib/Parser/preprocessor.h @@ -67,6 +67,7 @@ class Preprocessor { void Define(std::string macro, std::string value); void Undefine(std::string macro); + bool IsNameDefined(const CharBlock &); std::optional MacroReplacement( const TokenSequence &, const Prescanner &); @@ -79,7 +80,6 @@ class Preprocessor { enum class CanDeadElseAppear { No, Yes }; CharBlock SaveTokenAsName(const CharBlock &); - bool IsNameDefined(const CharBlock &); TokenSequence ReplaceMacros(const TokenSequence &, const Prescanner &); void SkipDisabledConditionalCode( const std::string &, IsElseActive, Prescanner *, ProvenanceRange); diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index c81d6cb508464..face0d20e4fc0 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -246,7 +246,7 @@ void Prescanner::NextLine() { } void Prescanner::LabelField(TokenSequence &token, int outCol) { - bool badLabel{false}; + const char *bad{nullptr}; for (; *at_ != '\n' && column_ <= 6; ++at_) { if (*at_ == '\t') { ++at_; @@ -256,16 +256,18 @@ void Prescanner::LabelField(TokenSequence &token, int outCol) { if (*at_ != ' ' && !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space EmitChar(token, *at_); - if (!IsDecimalDigit(*at_) && !badLabel) { - Say(GetProvenance(at_), - "Character in fixed-form label field must be a digit"_en_US); - badLabel = true; + if (!bad && !IsDecimalDigit(*at_)) { + bad = at_; } ++outCol; } ++column_; } if (outCol > 1) { + if (bad && 
!preprocessor_.IsNameDefined(token.CurrentOpenToken())) { + Say(GetProvenance(bad), + "Character in fixed-form label field must be a digit"_en_US); + } token.CloseToken(); } SkipToNextSignificantCharacter(); @@ -1098,6 +1100,15 @@ const char *Prescanner::IsCompilerDirectiveSentinel( return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str(); } +constexpr bool IsDirective(const char *match, const char *dir) { + for (; *match; ++match) { + if (*match != ToLowerCaseLetter(*dir++)) { + return false; + } + } + return true; +} + Prescanner::LineClassification Prescanner::ClassifyLine( const char *start) const { if (inFixedForm_) { @@ -1122,13 +1133,12 @@ Prescanner::LineClassification Prescanner::ClassifyLine( return {LineClassification::Kind::IncludeLine, *quoteOffset}; } if (const char *dir{IsPreprocessorDirectiveLine(start)}) { - if (std::memcmp(dir, "if", 2) == 0 || std::memcmp(dir, "elif", 4) == 0 || - std::memcmp(dir, "else", 4) == 0 || std::memcmp(dir, "endif", 5) == 0) { + if (IsDirective("if", dir) || IsDirective("elif", dir) || + IsDirective("else", dir) || IsDirective("endif", dir)) { return {LineClassification::Kind::ConditionalCompilationDirective}; - } else if (std::memcmp(dir, "include", 7) == 0) { + } else if (IsDirective("include", dir)) { return {LineClassification::Kind::IncludeDirective}; - } else if (std::memcmp(dir, "define", 6) == 0 || - std::memcmp(dir, "undef", 5) == 0) { + } else if (IsDirective("define", dir) || IsDirective("undef", dir)) { return {LineClassification::Kind::DefinitionDirective}; } else { return {LineClassification::Kind::PreprocessorDirective}; diff --git a/flang/lib/Semantics/CMakeLists.txt b/flang/lib/Semantics/CMakeLists.txt index 2bdc5f9582819..5d84061437728 100644 --- a/flang/lib/Semantics/CMakeLists.txt +++ b/flang/lib/Semantics/CMakeLists.txt @@ -31,8 +31,9 @@ add_flang_library(FortranSemantics pointer-assignment.cpp program-tree.cpp resolve-labels.cpp - resolve-names.cpp + resolve-directives.cpp resolve-names-utils.cpp + resolve-names.cpp rewrite-parse-tree.cpp scope.cpp semantics.cpp diff --git a/flang/lib/Semantics/check-select-type.cpp b/flang/lib/Semantics/check-select-type.cpp index 5b430440dffb1..ce675fa2f1dbe 100644 --- a/flang/lib/Semantics/check-select-type.cpp +++ b/flang/lib/Semantics/check-select-type.cpp @@ -39,7 +39,7 @@ class TypeCaseValues { if (std::holds_alternative(guard.u)) { typeCases_.emplace_back(stmt, std::nullopt); } else if (std::optional type{GetGuardType(guard)}) { - if (PassesChecksOnGuard(guard, *type)) { + if (PassesChecksOnGuard(stmt, *type)) { typeCases_.emplace_back(stmt, *type); } else { hasErrors_ = true; @@ -71,35 +71,46 @@ class TypeCaseValues { guard.u); } - bool PassesChecksOnGuard(const parser::TypeGuardStmt::Guard &guard, + bool PassesChecksOnGuard(const parser::Statement &stmt, const evaluate::DynamicType &guardDynamicType) { + const parser::TypeGuardStmt &typeGuardStmt{stmt.statement}; + const auto &guard{std::get(typeGuardStmt.t)}; return std::visit( common::visitors{ [](const parser::Default &) { return true; }, [&](const parser::TypeSpec &typeSpec) { - if (const DeclTypeSpec * spec{typeSpec.declTypeSpec}) { + const DeclTypeSpec *spec{typeSpec.declTypeSpec}; + CHECK(spec); + CHECK(spec->AsIntrinsic() || spec->AsDerived()); + bool typeSpecRetVal{false}; + if (spec->AsIntrinsic()) { + typeSpecRetVal = true; + if (!selectorType_.IsUnlimitedPolymorphic()) { // C1162 + context_.Say(stmt.source, + "If selector is not unlimited polymorphic, " + "an intrinsic type specification must not be 
specified " + "in the type guard statement"_err_en_US); + typeSpecRetVal = false; + } if (spec->category() == DeclTypeSpec::Character && !guardDynamicType.IsAssumedLengthCharacter()) { // C1160 context_.Say(parser::FindSourceLocation(typeSpec), "The type specification statement must have " "LEN type parameter as assumed"_err_en_US); - return false; + typeSpecRetVal = false; } - if (const DerivedTypeSpec * derived{spec->AsDerived()}) { - return PassesDerivedTypeChecks( - *derived, parser::FindSourceLocation(typeSpec)); - } - return false; + } else { + const DerivedTypeSpec *derived{spec->AsDerived()}; + typeSpecRetVal = PassesDerivedTypeChecks( + *derived, parser::FindSourceLocation(typeSpec)); } - return false; + return typeSpecRetVal; }, [&](const parser::DerivedTypeSpec &x) { - if (const semantics::DerivedTypeSpec * - derived{x.derivedTypeSpec}) { - return PassesDerivedTypeChecks( - *derived, parser::FindSourceLocation(x)); - } - return false; + CHECK(x.derivedTypeSpec); + const semantics::DerivedTypeSpec *derived{x.derivedTypeSpec}; + return PassesDerivedTypeChecks( + *derived, parser::FindSourceLocation(x)); }, }, guard.u); diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index bcc4c0caf46a5..8b24f6a1f3eae 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -257,7 +257,7 @@ auto ComputeOffsetsHelper::GetElementSize(const Symbol &symbol) // TODO: The size of procedure pointers is not yet known // and is independent of rank (and probably also the number // of length type parameters). - if (IsDescriptor(symbol) || IsProcedure(symbol)) { + if (IsDescriptor(symbol) || IsProcedurePointer(symbol)) { int lenParams{0}; if (const DerivedTypeSpec * derived{type->AsDerived()}) { lenParams = CountLenParameters(*derived); @@ -266,6 +266,9 @@ auto ComputeOffsetsHelper::GetElementSize(const Symbol &symbol) runtime::Descriptor::SizeInBytes(symbol.Rank(), false, lenParams)}; return {size, maxAlignment}; } + if (IsProcedure(symbol)) { + return {}; + } SizeAndAlignment result; if (const IntrinsicTypeSpec * intrinsic{type->AsIntrinsic()}) { if (auto kind{ToInt64(intrinsic->kind())}) { diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index e78c2d20edcf7..0a6e448e98a22 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -633,7 +633,8 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::BOZLiteralConstant &x) { ++p; auto value{BOZLiteralConstant::Read(p, base, false /*unsigned*/)}; if (*p != '"') { - Say("Invalid digit ('%c') in BOZ literal '%s'"_err_en_US, *p, x.v); + Say("Invalid digit ('%c') in BOZ literal '%s'"_err_en_US, *p, + x.v); // C7107, C7108 return std::nullopt; } if (value.overflow) { @@ -1202,6 +1203,7 @@ class ArrayConstructorContext { bool explicitType_{type_.has_value()}; std::optional constantLength_; ArrayConstructorValues values_; + bool messageDisplayedOnce{false}; }; void ArrayConstructorContext::Push(MaybeExpr &&x) { @@ -1252,17 +1254,21 @@ void ArrayConstructorContext::Push(MaybeExpr &&x) { } } } else { - exprAnalyzer_.Say( - "Values in array constructor must have the same declared type " - "when no explicit type appears"_err_en_US); + if (!messageDisplayedOnce) { + exprAnalyzer_.Say( + "Values in array constructor must have the same declared type " + "when no explicit type appears"_err_en_US); // C7110 + messageDisplayedOnce = true; + } } } else { if (auto cast{ConvertToType(*type_, std::move(*x))}) { 
values_.Push(std::move(*cast)); } else { exprAnalyzer_.Say( - "Value in array constructor could not be converted to the type " - "of the array"_err_en_US); + "Value in array constructor of type '%s' could not " + "be converted to the type of the array '%s'"_err_en_US, + x->GetType()->AsFortran(), type_->AsFortran()); // C7111, C7112 } } } @@ -1304,7 +1310,7 @@ void ArrayConstructorContext::Add(const parser::AcValue &x) { if (exprType->IsUnlimitedPolymorphic()) { exprAnalyzer_.Say( "Cannot have an unlimited polymorphic value in an " - "array constructor"_err_en_US); + "array constructor"_err_en_US); // C7113 } } Push(std::move(*v)); @@ -1346,7 +1352,7 @@ void ArrayConstructorContext::Add(const parser::AcValue &x) { } else { exprAnalyzer_.SayAt(name, "Implied DO index is active in surrounding implied DO loop " - "and may not have the same name"_err_en_US); + "and may not have the same name"_err_en_US); // C7115 } }, }, @@ -1386,7 +1392,7 @@ MaybeExpr ExpressionAnalyzer::Analyze( "ABSTRACT derived type '%s' may not be used in a " "structure constructor"_err_en_US, typeName), - typeSymbol); + typeSymbol); // C7114 } // This iterator traverses all of the components in the derived type and its diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp new file mode 100644 index 0000000000000..d05eb5385ba87 --- /dev/null +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -0,0 +1,1004 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "resolve-directives.h" + +#include "check-acc-structure.h" +#include "check-omp-structure.h" +#include "resolve-names-utils.h" +#include "flang/Common/idioms.h" +#include "flang/Evaluate/fold.h" +#include "flang/Parser/parse-tree-visitor.h" +#include "flang/Parser/parse-tree.h" +#include "flang/Parser/tools.h" +#include "flang/Semantics/expression.h" +#include +#include + +namespace Fortran::semantics { + +template class DirectiveAttributeVisitor { +public: + explicit DirectiveAttributeVisitor(SemanticsContext &context) + : context_{context} {} + + template bool Pre(const A &) { return true; } + template void Post(const A &) {} + +protected: + struct DirContext { + DirContext(const parser::CharBlock &source, T d, Scope &s) + : directiveSource{source}, directive{d}, scope{s} {} + parser::CharBlock directiveSource; + T directive; + Scope &scope; + Symbol::Flag defaultDSA{Symbol::Flag::AccShared}; // TODOACC + std::map objectWithDSA; + bool withinConstruct{false}; + std::int64_t associatedLoopLevel{0}; + }; + + DirContext &GetContext() { + CHECK(!dirContext_.empty()); + return dirContext_.back(); + } + void PushContext(const parser::CharBlock &source, T dir) { + dirContext_.emplace_back(source, dir, context_.FindScope(source)); + } + void PopContext() { dirContext_.pop_back(); } + void SetContextDirectiveSource(parser::CharBlock &dir) { + GetContext().directiveSource = dir; + } + Scope &currScope() { return GetContext().scope; } + void SetContextDefaultDSA(Symbol::Flag flag) { + GetContext().defaultDSA = flag; + } + void AddToContextObjectWithDSA( + const Symbol &symbol, Symbol::Flag flag, DirContext &context) { + context.objectWithDSA.emplace(&symbol, flag); + } + void 
AddToContextObjectWithDSA(const Symbol &symbol, Symbol::Flag flag) { + AddToContextObjectWithDSA(symbol, flag, GetContext()); + } + bool IsObjectWithDSA(const Symbol &symbol) { + auto it{GetContext().objectWithDSA.find(&symbol)}; + return it != GetContext().objectWithDSA.end(); + } + void SetContextAssociatedLoopLevel(std::int64_t level) { + GetContext().associatedLoopLevel = level; + } + Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev, Scope &scope) { + const auto pair{scope.try_emplace(name, Attrs{}, HostAssocDetails{prev})}; + return *pair.first->second; + } + Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev) { + return MakeAssocSymbol(name, prev, currScope()); + } + static const parser::Name *GetDesignatorNameIfDataRef( + const parser::Designator &designator) { + const auto *dataRef{std::get_if(&designator.u)}; + return dataRef ? std::get_if(&dataRef->u) : nullptr; + } + void AddDataSharingAttributeObject(SymbolRef object) { + dataSharingAttributeObjects_.insert(object); + } + void ClearDataSharingAttributeObjects() { + dataSharingAttributeObjects_.clear(); + } + bool HasDataSharingAttributeObject(const Symbol &); + const parser::Name &GetLoopIndex(const parser::DoConstruct &); + const parser::DoConstruct *GetDoConstructIf( + const parser::ExecutionPartConstruct &); + Symbol *DeclarePrivateAccessEntity( + const parser::Name &, Symbol::Flag, Scope &); + Symbol *DeclarePrivateAccessEntity(Symbol &, Symbol::Flag, Scope &); + Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); + + SymbolSet dataSharingAttributeObjects_; // on one directive + SemanticsContext &context_; + std::vector dirContext_; // used as a stack +}; + +class AccAttributeVisitor : DirectiveAttributeVisitor { +public: + explicit AccAttributeVisitor(SemanticsContext &context) + : DirectiveAttributeVisitor(context) {} + + template void Walk(const A &x) { parser::Walk(x, *this); } + template bool Pre(const A &) { return true; } + template void Post(const A &) {} + + bool Pre(const parser::SpecificationPart &x) { + Walk(std::get>(x.t)); + return false; + } + + bool Pre(const parser::OpenACCBlockConstruct &); + void Post(const parser::OpenACCBlockConstruct &) { PopContext(); } + bool Pre(const parser::OpenACCCombinedConstruct &); + void Post(const parser::OpenACCCombinedConstruct &) { PopContext(); } + + void Post(const parser::AccBeginBlockDirective &) { + GetContext().withinConstruct = true; + } + + bool Pre(const parser::OpenACCLoopConstruct &); + void Post(const parser::OpenACCLoopConstruct &) { PopContext(); } + void Post(const parser::AccLoopDirective &) { + GetContext().withinConstruct = true; + } + + bool Pre(const parser::OpenACCStandaloneConstruct &); + void Post(const parser::OpenACCStandaloneConstruct &) { PopContext(); } + void Post(const parser::AccStandaloneDirective &) { + GetContext().withinConstruct = true; + } + + void Post(const parser::AccDefaultClause &); + + bool Pre(const parser::AccClause::Copy &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccCopyIn); + ResolveAccObjectList(x.v, Symbol::Flag::AccCopyOut); + return false; + } + + bool Pre(const parser::AccClause::Create &x) { + const auto &objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCreate); + return false; + } + + bool Pre(const parser::AccClause::Copyin &x) { + const auto &objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCopyIn); + return false; + } + + bool Pre(const parser::AccClause::Copyout &x) { + const auto 
&objectList{std::get(x.v.t)}; + ResolveAccObjectList(objectList, Symbol::Flag::AccCopyOut); + return false; + } + + bool Pre(const parser::AccClause::Present &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccPresent); + return false; + } + bool Pre(const parser::AccClause::Private &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccPrivate); + return false; + } + bool Pre(const parser::AccClause::FirstPrivate &x) { + ResolveAccObjectList(x.v, Symbol::Flag::AccFirstPrivate); + return false; + } + + void Post(const parser::Name &); + +private: + std::int64_t GetAssociatedLoopLevelFromClauses(const parser::AccClauseList &); + + static constexpr Symbol::Flags dataSharingAttributeFlags{ + Symbol::Flag::AccShared, Symbol::Flag::AccPrivate, + Symbol::Flag::AccPresent, Symbol::Flag::AccFirstPrivate, + Symbol::Flag::AccReduction}; + + static constexpr Symbol::Flags dataMappingAttributeFlags{ + Symbol::Flag::AccCreate, Symbol::Flag::AccCopyIn, + Symbol::Flag::AccCopyOut, Symbol::Flag::AccDelete}; + + static constexpr Symbol::Flags accFlagsRequireNewSymbol{ + Symbol::Flag::AccPrivate, Symbol::Flag::AccFirstPrivate, + Symbol::Flag::AccReduction}; + + static constexpr Symbol::Flags accFlagsRequireMark{}; + + void PrivatizeAssociatedLoopIndex(const parser::OpenACCLoopConstruct &); + void ResolveAccObjectList(const parser::AccObjectList &, Symbol::Flag); + void ResolveAccObject(const parser::AccObject &, Symbol::Flag); + Symbol *ResolveAcc(const parser::Name &, Symbol::Flag, Scope &); + Symbol *ResolveAcc(Symbol &, Symbol::Flag, Scope &); + Symbol *ResolveAccCommonBlockName(const parser::Name *); + Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); + Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); + void CheckMultipleAppearances( + const parser::Name &, const Symbol &, Symbol::Flag); +}; + +// Data-sharing and Data-mapping attributes for data-refs in OpenMP construct +class OmpAttributeVisitor : DirectiveAttributeVisitor { +public: + explicit OmpAttributeVisitor(SemanticsContext &context) + : DirectiveAttributeVisitor(context) {} + + template void Walk(const A &x) { parser::Walk(x, *this); } + template bool Pre(const A &) { return true; } + template void Post(const A &) {} + + bool Pre(const parser::SpecificationPart &x) { + Walk(std::get>(x.t)); + return false; + } + + bool Pre(const parser::OpenMPBlockConstruct &); + void Post(const parser::OpenMPBlockConstruct &) { PopContext(); } + void Post(const parser::OmpBeginBlockDirective &) { + GetContext().withinConstruct = true; + } + + bool Pre(const parser::OpenMPLoopConstruct &); + void Post(const parser::OpenMPLoopConstruct &) { PopContext(); } + void Post(const parser::OmpBeginLoopDirective &) { + GetContext().withinConstruct = true; + } + bool Pre(const parser::DoConstruct &); + + bool Pre(const parser::OpenMPSectionsConstruct &); + void Post(const parser::OpenMPSectionsConstruct &) { PopContext(); } + + bool Pre(const parser::OpenMPThreadprivate &); + void Post(const parser::OpenMPThreadprivate &) { PopContext(); } + + // 2.15.3 Data-Sharing Attribute Clauses + void Post(const parser::OmpDefaultClause &); + bool Pre(const parser::OmpClause::Shared &x) { + ResolveOmpObjectList(x.v, Symbol::Flag::OmpShared); + return false; + } + bool Pre(const parser::OmpClause::Private &x) { + ResolveOmpObjectList(x.v, Symbol::Flag::OmpPrivate); + return false; + } + bool Pre(const parser::OmpClause::Firstprivate &x) { + ResolveOmpObjectList(x.v, Symbol::Flag::OmpFirstPrivate); + return false; + } + bool Pre(const 
parser::OmpClause::Lastprivate &x) { + ResolveOmpObjectList(x.v, Symbol::Flag::OmpLastPrivate); + return false; + } + + void Post(const parser::Name &); + +private: + std::int64_t GetAssociatedLoopLevelFromClauses(const parser::OmpClauseList &); + + static constexpr Symbol::Flags dataSharingAttributeFlags{ + Symbol::Flag::OmpShared, Symbol::Flag::OmpPrivate, + Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, + Symbol::Flag::OmpReduction, Symbol::Flag::OmpLinear}; + + static constexpr Symbol::Flags ompFlagsRequireNewSymbol{ + Symbol::Flag::OmpPrivate, Symbol::Flag::OmpLinear, + Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, + Symbol::Flag::OmpReduction}; + + static constexpr Symbol::Flags ompFlagsRequireMark{ + Symbol::Flag::OmpThreadprivate}; + + // Predetermined DSA rules + void PrivatizeAssociatedLoopIndex(const parser::OpenMPLoopConstruct &); + void ResolveSeqLoopIndexInParallelOrTaskConstruct(const parser::Name &); + + void ResolveOmpObjectList(const parser::OmpObjectList &, Symbol::Flag); + void ResolveOmpObject(const parser::OmpObject &, Symbol::Flag); + Symbol *ResolveOmp(const parser::Name &, Symbol::Flag, Scope &); + Symbol *ResolveOmp(Symbol &, Symbol::Flag, Scope &); + Symbol *ResolveOmpCommonBlockName(const parser::Name *); + Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); + Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); + void CheckMultipleAppearances( + const parser::Name &, const Symbol &, Symbol::Flag); +}; + +template +bool DirectiveAttributeVisitor::HasDataSharingAttributeObject( + const Symbol &object) { + auto it{dataSharingAttributeObjects_.find(object)}; + return it != dataSharingAttributeObjects_.end(); +} + +template +const parser::Name &DirectiveAttributeVisitor::GetLoopIndex( + const parser::DoConstruct &x) { + using Bounds = parser::LoopControl::Bounds; + return std::get(x.GetLoopControl()->u).name.thing; +} + +template +const parser::DoConstruct *DirectiveAttributeVisitor::GetDoConstructIf( + const parser::ExecutionPartConstruct &x) { + return parser::Unwrap(x); +} + +template +Symbol *DirectiveAttributeVisitor::DeclarePrivateAccessEntity( + const parser::Name &name, Symbol::Flag flag, Scope &scope) { + if (!name.symbol) { + return nullptr; // not resolved by Name Resolution step, do nothing + } + name.symbol = DeclarePrivateAccessEntity(*name.symbol, flag, scope); + return name.symbol; +} + +template +Symbol *DirectiveAttributeVisitor::DeclarePrivateAccessEntity( + Symbol &object, Symbol::Flag flag, Scope &scope) { + if (object.owner() != currScope()) { + auto &symbol{MakeAssocSymbol(object.name(), object, scope)}; + symbol.set(flag); + return &symbol; + } else { + object.set(flag); + return &object; + } +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCBlockConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &blockDir{std::get(beginBlockDir.t)}; + switch (blockDir.v) { + case llvm::acc::Directive::ACCD_data: + case llvm::acc::Directive::ACCD_host_data: + case llvm::acc::Directive::ACCD_kernels: + case llvm::acc::Directive::ACCD_parallel: + case llvm::acc::Directive::ACCD_serial: + PushContext(blockDir.source, blockDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCLoopConstruct &x) { + const auto &beginDir{std::get(x.t)}; + const auto &loopDir{std::get(beginDir.t)}; + const auto &clauseList{std::get(beginDir.t)}; + if (loopDir.v == llvm::acc::Directive::ACCD_loop) 
+ +bool AccAttributeVisitor::Pre(const parser::OpenACCLoopConstruct &x) { + const auto &beginDir{std::get<parser::AccBeginLoopDirective>(x.t)}; + const auto &loopDir{std::get<parser::AccLoopDirective>(beginDir.t)}; + const auto &clauseList{std::get<parser::AccClauseList>(beginDir.t)}; + if (loopDir.v == llvm::acc::Directive::ACCD_loop) { + PushContext(loopDir.source, loopDir.v); + } + ClearDataSharingAttributeObjects(); + SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList)); + PrivatizeAssociatedLoopIndex(x); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCStandaloneConstruct &x) { + const auto &standaloneDir{std::get<parser::AccStandaloneDirective>(x.t)}; + switch (standaloneDir.v) { + case llvm::acc::Directive::ACCD_cache: + case llvm::acc::Directive::ACCD_enter_data: + case llvm::acc::Directive::ACCD_exit_data: + case llvm::acc::Directive::ACCD_init: + case llvm::acc::Directive::ACCD_set: + case llvm::acc::Directive::ACCD_shutdown: + case llvm::acc::Directive::ACCD_update: + PushContext(standaloneDir.source, standaloneDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool AccAttributeVisitor::Pre(const parser::OpenACCCombinedConstruct &x) { + const auto &beginBlockDir{std::get<parser::AccBeginCombinedDirective>(x.t)}; + const auto &combinedDir{ + std::get<parser::AccCombinedDirective>(beginBlockDir.t)}; + switch (combinedDir.v) { + case llvm::acc::Directive::ACCD_kernels_loop: + case llvm::acc::Directive::ACCD_parallel_loop: + case llvm::acc::Directive::ACCD_serial_loop: + PushContext(combinedDir.source, combinedDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +std::int64_t AccAttributeVisitor::GetAssociatedLoopLevelFromClauses( + const parser::AccClauseList &x) { + std::int64_t collapseLevel{0}; + for (const auto &clause : x.v) { + if (const auto *collapseClause{ + std::get_if<parser::AccClause::Collapse>(&clause.u)}) { + if (const auto v{EvaluateInt64(context_, collapseClause->v)}) { + collapseLevel = *v; + } + } + } + + if (collapseLevel) { + return collapseLevel; + } + return 1; // default is outermost loop +} + +void AccAttributeVisitor::PrivatizeAssociatedLoopIndex( + const parser::OpenACCLoopConstruct &x) { + std::int64_t level{GetContext().associatedLoopLevel}; + if (level <= 0) { // collapse value was negative or 0 + return; + } + Symbol::Flag ivDSA{Symbol::Flag::AccPrivate}; + + const auto &outer{std::get<std::optional<parser::DoConstruct>>(x.t)}; + for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) { + // go through all the nested do-loops and resolve index variables + const parser::Name &iv{GetLoopIndex(*loop)}; + if (auto *symbol{ResolveAcc(iv, ivDSA, currScope())}) { + symbol->set(Symbol::Flag::AccPreDetermined); + iv.symbol = symbol; // adjust the symbol within region + AddToContextObjectWithDSA(*symbol, ivDSA); + } + + const auto &block{std::get<parser::Block>(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ?
GetDoConstructIf(*it) : nullptr; + } + CHECK(level == 0); +} + +void AccAttributeVisitor::Post(const parser::AccDefaultClause &x) { + if (!dirContext_.empty()) { + switch (x.v) { + case parser::AccDefaultClause::Arg::Present: + SetContextDefaultDSA(Symbol::Flag::AccPresent); + break; + case parser::AccDefaultClause::Arg::None: + SetContextDefaultDSA(Symbol::Flag::AccNone); + break; + } + } +} + +// For OpenACC constructs, check all the data-refs within the constructs +// and adjust the symbol for each Name if necessary +void AccAttributeVisitor::Post(const parser::Name &name) { + auto *symbol{name.symbol}; + if (symbol && !dirContext_.empty() && GetContext().withinConstruct) { + if (!symbol->owner().IsDerivedType() && !symbol->has() && + !IsObjectWithDSA(*symbol)) { + if (Symbol * found{currScope().FindSymbol(name.source)}) { + if (symbol != found) { + name.symbol = found; // adjust the symbol within region + } else if (GetContext().defaultDSA == Symbol::Flag::AccNone) { + // 2.5.14. + context_.Say(name.source, + "The DEFAULT(NONE) clause requires that '%s' must be listed in " + "a data-mapping clause"_err_en_US, + symbol->name()); + } + } + } + } // within OpenACC construct +} + +Symbol *AccAttributeVisitor::ResolveAccCommonBlockName( + const parser::Name *name) { + if (!name) { + return nullptr; + } else if (auto *prev{ + GetContext().scope.parent().FindCommonBlock(name->source)}) { + name->symbol = prev; + return prev; + } else { + return nullptr; + } +} + +void AccAttributeVisitor::ResolveAccObjectList( + const parser::AccObjectList &accObjectList, Symbol::Flag accFlag) { + for (const auto &accObject : accObjectList.v) { + ResolveAccObject(accObject, accFlag); + } +} + +void AccAttributeVisitor::ResolveAccObject( + const parser::AccObject &accObject, Symbol::Flag accFlag) { + std::visit( + common::visitors{ + [&](const parser::Designator &designator) { + if (const auto *name{GetDesignatorNameIfDataRef(designator)}) { + if (auto *symbol{ResolveAcc(*name, accFlag, currScope())}) { + AddToContextObjectWithDSA(*symbol, accFlag); + if (dataSharingAttributeFlags.test(accFlag)) { + CheckMultipleAppearances(*name, *symbol, accFlag); + } + } + } else { + // Array sections to be changed to substrings as needed + if (AnalyzeExpr(context_, designator)) { + if (std::holds_alternative(designator.u)) { + context_.Say(designator.source, + "Substrings are not allowed on OpenACC " + "directives or clauses"_err_en_US); + } + } + // other checks, more TBD + } + }, + [&](const parser::Name &name) { // common block + if (auto *symbol{ResolveAccCommonBlockName(&name)}) { + CheckMultipleAppearances( + name, *symbol, Symbol::Flag::AccCommonBlock); + for (auto &object : symbol->get().objects()) { + if (auto *resolvedObject{ + ResolveAcc(*object, accFlag, currScope())}) { + AddToContextObjectWithDSA(*resolvedObject, accFlag); + } + } + } else { + context_.Say(name.source, + "COMMON block must be declared in the same scoping unit " + "in which the OpenACC directive or clause appears"_err_en_US); + } + }, + }, + accObject.u); +} + +Symbol *AccAttributeVisitor::ResolveAcc( + const parser::Name &name, Symbol::Flag accFlag, Scope &scope) { + if (accFlagsRequireNewSymbol.test(accFlag)) { + return DeclarePrivateAccessEntity(name, accFlag, scope); + } else { + return DeclareOrMarkOtherAccessEntity(name, accFlag); + } +} + +Symbol *AccAttributeVisitor::ResolveAcc( + Symbol &symbol, Symbol::Flag accFlag, Scope &scope) { + if (accFlagsRequireNewSymbol.test(accFlag)) { + return DeclarePrivateAccessEntity(symbol, accFlag, 
scope); + } else { + return DeclareOrMarkOtherAccessEntity(symbol, accFlag); + } +} + +Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( + const parser::Name &name, Symbol::Flag accFlag) { + Symbol *prev{currScope().FindSymbol(name.source)}; + if (!name.symbol || !prev) { + return nullptr; + } else if (prev != name.symbol) { + name.symbol = prev; + } + return DeclareOrMarkOtherAccessEntity(*prev, accFlag); +} + +Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( + Symbol &object, Symbol::Flag accFlag) { + if (accFlagsRequireMark.test(accFlag)) { + object.set(accFlag); + } + return &object; +} + +static bool WithMultipleAppearancesAccException( + const Symbol &symbol, Symbol::Flag flag) { + return false; // placeholder +} + +void AccAttributeVisitor::CheckMultipleAppearances( + const parser::Name &name, const Symbol &symbol, Symbol::Flag accFlag) { + const auto *target{&symbol}; + if (accFlagsRequireNewSymbol.test(accFlag)) { + if (const auto *details{symbol.detailsIf<HostAssocDetails>()}) { + target = &details->symbol(); + } + } + if (HasDataSharingAttributeObject(*target) && + !WithMultipleAppearancesAccException(symbol, accFlag)) { + context_.Say(name.source, + "'%s' appears in more than one data-sharing clause " + "on the same OpenACC directive"_err_en_US, + name.ToString()); + } else { + AddDataSharingAttributeObject(*target); + } +} + +bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) { + const auto &beginBlockDir{std::get<parser::OmpBeginBlockDirective>(x.t)}; + const auto &beginDir{std::get<parser::OmpBlockDirective>(beginBlockDir.t)}; + switch (beginDir.v) { + case llvm::omp::Directive::OMPD_master: + case llvm::omp::Directive::OMPD_ordered: + case llvm::omp::Directive::OMPD_parallel: + case llvm::omp::Directive::OMPD_single: + case llvm::omp::Directive::OMPD_target: + case llvm::omp::Directive::OMPD_target_data: + case llvm::omp::Directive::OMPD_task: + case llvm::omp::Directive::OMPD_teams: + case llvm::omp::Directive::OMPD_workshare: + case llvm::omp::Directive::OMPD_parallel_workshare: + case llvm::omp::Directive::OMPD_target_teams: + case llvm::omp::Directive::OMPD_target_parallel: + PushContext(beginDir.source, beginDir.v); + break; + default: + // TODO others + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { + const auto &beginLoopDir{std::get<parser::OmpBeginLoopDirective>(x.t)}; + const auto &beginDir{std::get<parser::OmpLoopDirective>(beginLoopDir.t)}; + const auto &clauseList{std::get<parser::OmpClauseList>(beginLoopDir.t)}; + switch (beginDir.v) { + case llvm::omp::Directive::OMPD_distribute: + case llvm::omp::Directive::OMPD_distribute_parallel_do: + case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: + case llvm::omp::Directive::OMPD_distribute_simd: + case llvm::omp::Directive::OMPD_do: + case llvm::omp::Directive::OMPD_do_simd: + case llvm::omp::Directive::OMPD_parallel_do: + case llvm::omp::Directive::OMPD_parallel_do_simd: + case llvm::omp::Directive::OMPD_simd: + case llvm::omp::Directive::OMPD_target_parallel_do: + case llvm::omp::Directive::OMPD_target_parallel_do_simd: + case llvm::omp::Directive::OMPD_target_teams_distribute: + case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do: + case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do_simd: + case llvm::omp::Directive::OMPD_target_teams_distribute_simd: + case llvm::omp::Directive::OMPD_target_simd: + case llvm::omp::Directive::OMPD_taskloop: + case llvm::omp::Directive::OMPD_taskloop_simd: + case llvm::omp::Directive::OMPD_teams_distribute: + case
llvm::omp::Directive::OMPD_teams_distribute_parallel_do: + case llvm::omp::Directive::OMPD_teams_distribute_parallel_do_simd: + case llvm::omp::Directive::OMPD_teams_distribute_simd: + PushContext(beginDir.source, beginDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList)); + PrivatizeAssociatedLoopIndex(x); + return true; +} + +void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( + const parser::Name &iv) { + auto targetIt{dirContext_.rbegin()}; + for (;; ++targetIt) { + if (targetIt == dirContext_.rend()) { + return; + } + if (llvm::omp::parallelSet.test(targetIt->directive) || + llvm::omp::taskGeneratingSet.test(targetIt->directive)) { + break; + } + } + if (auto *symbol{ResolveOmp(iv, Symbol::Flag::OmpPrivate, targetIt->scope)}) { + targetIt++; + symbol->set(Symbol::Flag::OmpPreDetermined); + iv.symbol = symbol; // adjust the symbol within region + for (auto it{dirContext_.rbegin()}; it != targetIt; ++it) { + AddToContextObjectWithDSA(*symbol, Symbol::Flag::OmpPrivate, *it); + } + } +} + +// 2.15.1.1 Data-sharing Attribute Rules - Predetermined +// - A loop iteration variable for a sequential loop in a parallel +// or task generating construct is private in the innermost such +// construct that encloses the loop +bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { + if (!dirContext_.empty() && GetContext().withinConstruct) { + if (const auto &iv{GetLoopIndex(x)}; iv.symbol) { + if (!iv.symbol->test(Symbol::Flag::OmpPreDetermined)) { + ResolveSeqLoopIndexInParallelOrTaskConstruct(iv); + } else { + // TODO: conflict checks with explicitly determined DSA + } + } + } + return true; +} + +std::int64_t OmpAttributeVisitor::GetAssociatedLoopLevelFromClauses( + const parser::OmpClauseList &x) { + std::int64_t orderedLevel{0}; + std::int64_t collapseLevel{0}; + for (const auto &clause : x.v) { + if (const auto *orderedClause{ + std::get_if(&clause.u)}) { + if (const auto v{EvaluateInt64(context_, orderedClause->v)}) { + orderedLevel = *v; + } + } + if (const auto *collapseClause{ + std::get_if(&clause.u)}) { + if (const auto v{EvaluateInt64(context_, collapseClause->v)}) { + collapseLevel = *v; + } + } + } + + if (orderedLevel && (!collapseLevel || orderedLevel >= collapseLevel)) { + return orderedLevel; + } else if (!orderedLevel && collapseLevel) { + return collapseLevel; + } // orderedLevel < collapseLevel is an error handled in structural checks + return 1; // default is outermost loop +} + +// 2.15.1.1 Data-sharing Attribute Rules - Predetermined +// - The loop iteration variable(s) in the associated do-loop(s) of a do, +// parallel do, taskloop, or distribute construct is (are) private. +// - The loop iteration variable in the associated do-loop of a simd construct +// with just one associated do-loop is linear with a linear-step that is the +// increment of the associated do-loop. +// - The loop iteration variables in the associated do-loops of a simd +// construct with multiple associated do-loops are lastprivate. 
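+// For example (an editorial illustration, not text from the source): with
+// "!$omp parallel do collapse(2)" the indices of both associated do-loops
+// get OmpPrivate; with "!$omp simd" and a single associated do-loop the
+// index gets OmpLinear; with "!$omp simd collapse(2)" both indices get
+// OmpLastPrivate.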
+// +// TODO: revisit after semantics checks are completed for do-loop association of +// collapse and ordered +void OmpAttributeVisitor::PrivatizeAssociatedLoopIndex( + const parser::OpenMPLoopConstruct &x) { + std::int64_t level{GetContext().associatedLoopLevel}; + if (level <= 0) { + return; + } + Symbol::Flag ivDSA; + if (!llvm::omp::simdSet.test(GetContext().directive)) { + ivDSA = Symbol::Flag::OmpPrivate; + } else if (level == 1) { + ivDSA = Symbol::Flag::OmpLinear; + } else { + ivDSA = Symbol::Flag::OmpLastPrivate; + } + + const auto &outer{std::get>(x.t)}; + for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) { + // go through all the nested do-loops and resolve index variables + const parser::Name &iv{GetLoopIndex(*loop)}; + if (auto *symbol{ResolveOmp(iv, ivDSA, currScope())}) { + symbol->set(Symbol::Flag::OmpPreDetermined); + iv.symbol = symbol; // adjust the symbol within region + AddToContextObjectWithDSA(*symbol, ivDSA); + } + + const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ? GetDoConstructIf(*it) : nullptr; + } + CHECK(level == 0); +} + +bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) { + const auto &beginSectionsDir{ + std::get(x.t)}; + const auto &beginDir{ + std::get(beginSectionsDir.t)}; + switch (beginDir.v) { + case llvm::omp::Directive::OMPD_parallel_sections: + case llvm::omp::Directive::OMPD_sections: + PushContext(beginDir.source, beginDir.v); + break; + default: + break; + } + ClearDataSharingAttributeObjects(); + return true; +} + +bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { + PushContext(x.source, llvm::omp::Directive::OMPD_threadprivate); + const auto &list{std::get(x.t)}; + ResolveOmpObjectList(list, Symbol::Flag::OmpThreadprivate); + return false; +} + +void OmpAttributeVisitor::Post(const parser::OmpDefaultClause &x) { + if (!dirContext_.empty()) { + switch (x.v) { + case parser::OmpDefaultClause::Type::Private: + SetContextDefaultDSA(Symbol::Flag::OmpPrivate); + break; + case parser::OmpDefaultClause::Type::Firstprivate: + SetContextDefaultDSA(Symbol::Flag::OmpFirstPrivate); + break; + case parser::OmpDefaultClause::Type::Shared: + SetContextDefaultDSA(Symbol::Flag::OmpShared); + break; + case parser::OmpDefaultClause::Type::None: + SetContextDefaultDSA(Symbol::Flag::OmpNone); + break; + } + } +} + +// For OpenMP constructs, check all the data-refs within the constructs +// and adjust the symbol for each Name if necessary +void OmpAttributeVisitor::Post(const parser::Name &name) { + auto *symbol{name.symbol}; + if (symbol && !dirContext_.empty() && GetContext().withinConstruct) { + if (!symbol->owner().IsDerivedType() && !symbol->has() && + !IsObjectWithDSA(*symbol)) { + // TODO: create a separate function to go through the rules for + // predetermined, explicitly determined, and implicitly + // determined data-sharing attributes (2.15.1.1). + if (Symbol * found{currScope().FindSymbol(name.source)}) { + if (symbol != found) { + name.symbol = found; // adjust the symbol within region + } else if (GetContext().defaultDSA == Symbol::Flag::OmpNone) { + context_.Say(name.source, + "The DEFAULT(NONE) clause requires that '%s' must be listed in " + "a data-sharing attribute clause"_err_en_US, + symbol->name()); + } + } + } + } // within OpenMP construct +} + +Symbol *OmpAttributeVisitor::ResolveOmpCommonBlockName( + const parser::Name *name) { + if (auto *prev{name + ? 
GetContext().scope.parent().FindCommonBlock(name->source) + : nullptr}) { + name->symbol = prev; + return prev; + } else { + return nullptr; + } +} + +void OmpAttributeVisitor::ResolveOmpObjectList( + const parser::OmpObjectList &ompObjectList, Symbol::Flag ompFlag) { + for (const auto &ompObject : ompObjectList.v) { + ResolveOmpObject(ompObject, ompFlag); + } +} + +void OmpAttributeVisitor::ResolveOmpObject( + const parser::OmpObject &ompObject, Symbol::Flag ompFlag) { + std::visit( + common::visitors{ + [&](const parser::Designator &designator) { + if (const auto *name{GetDesignatorNameIfDataRef(designator)}) { + if (auto *symbol{ResolveOmp(*name, ompFlag, currScope())}) { + AddToContextObjectWithDSA(*symbol, ompFlag); + if (dataSharingAttributeFlags.test(ompFlag)) { + CheckMultipleAppearances(*name, *symbol, ompFlag); + } + } + } else { + // Array sections to be changed to substrings as needed + if (AnalyzeExpr(context_, designator)) { + if (std::holds_alternative(designator.u)) { + context_.Say(designator.source, + "Substrings are not allowed on OpenMP " + "directives or clauses"_err_en_US); + } + } + // other checks, more TBD + } + }, + [&](const parser::Name &name) { // common block + if (auto *symbol{ResolveOmpCommonBlockName(&name)}) { + CheckMultipleAppearances( + name, *symbol, Symbol::Flag::OmpCommonBlock); + // 2.15.3 When a named common block appears in a list, it has the + // same meaning as if every explicit member of the common block + // appeared in the list + for (auto &object : symbol->get().objects()) { + if (auto *resolvedObject{ + ResolveOmp(*object, ompFlag, currScope())}) { + AddToContextObjectWithDSA(*resolvedObject, ompFlag); + } + } + } else { + context_.Say(name.source, // 2.15.3 + "COMMON block must be declared in the same scoping unit " + "in which the OpenMP directive or clause appears"_err_en_US); + } + }, + }, + ompObject.u); +} + +Symbol *OmpAttributeVisitor::ResolveOmp( + const parser::Name &name, Symbol::Flag ompFlag, Scope &scope) { + if (ompFlagsRequireNewSymbol.test(ompFlag)) { + return DeclarePrivateAccessEntity(name, ompFlag, scope); + } else { + return DeclareOrMarkOtherAccessEntity(name, ompFlag); + } +} + +Symbol *OmpAttributeVisitor::ResolveOmp( + Symbol &symbol, Symbol::Flag ompFlag, Scope &scope) { + if (ompFlagsRequireNewSymbol.test(ompFlag)) { + return DeclarePrivateAccessEntity(symbol, ompFlag, scope); + } else { + return DeclareOrMarkOtherAccessEntity(symbol, ompFlag); + } +} + +Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( + const parser::Name &name, Symbol::Flag ompFlag) { + Symbol *prev{currScope().FindSymbol(name.source)}; + if (!name.symbol || !prev) { + return nullptr; + } else if (prev != name.symbol) { + name.symbol = prev; + } + return DeclareOrMarkOtherAccessEntity(*prev, ompFlag); +} + +Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( + Symbol &object, Symbol::Flag ompFlag) { + if (ompFlagsRequireMark.test(ompFlag)) { + object.set(ompFlag); + } + return &object; +} + +static bool WithMultipleAppearancesOmpException( + const Symbol &symbol, Symbol::Flag flag) { + return (flag == Symbol::Flag::OmpFirstPrivate && + symbol.test(Symbol::Flag::OmpLastPrivate)) || + (flag == Symbol::Flag::OmpLastPrivate && + symbol.test(Symbol::Flag::OmpFirstPrivate)); +} + +void OmpAttributeVisitor::CheckMultipleAppearances( + const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) { + const auto *target{&symbol}; + if (ompFlagsRequireNewSymbol.test(ompFlag)) { + if (const auto *details{symbol.detailsIf()}) { + 
target = &details->symbol(); + } + } + if (HasDataSharingAttributeObject(*target) && + !WithMultipleAppearancesOmpException(symbol, ompFlag)) { + context_.Say(name.source, + "'%s' appears in more than one data-sharing clause " + "on the same OpenMP directive"_err_en_US, + name.ToString()); + } else { + AddDataSharingAttributeObject(*target); + } +} + +void ResolveAccParts( + SemanticsContext &context, const parser::ProgramUnit &node) { + if (context.IsEnabled(common::LanguageFeature::OpenACC)) { + AccAttributeVisitor{context}.Walk(node); + } +} + +void ResolveOmpParts( + SemanticsContext &context, const parser::ProgramUnit &node) { + if (context.IsEnabled(common::LanguageFeature::OpenMP)) { + OmpAttributeVisitor{context}.Walk(node); + if (!context.AnyFatalError()) { + // The data-sharing attribute of the loop iteration variable for a + // sequential loop (2.15.1.1) can only be determined when visiting + // the corresponding DoConstruct, a second walk is to adjust the + // symbols for all the data-refs of that loop iteration variable + // prior to the DoConstruct. + OmpAttributeVisitor{context}.Walk(node); + } + } +} + +} // namespace Fortran::semantics diff --git a/flang/lib/Semantics/resolve-directives.h b/flang/lib/Semantics/resolve-directives.h new file mode 100644 index 0000000000000..6ba7a06252942 --- /dev/null +++ b/flang/lib/Semantics/resolve-directives.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_SEMANTICS_RESOLVE_DIRECTIVES_H_ +#define FORTRAN_SEMANTICS_RESOLVE_DIRECTIVES_H_ + +namespace Fortran::parser { +struct Name; +struct ProgramUnit; +} // namespace Fortran::parser + +namespace Fortran::semantics { + +class SemanticsContext; + +// Name resolution for OpenACC and OpenMP directives +void ResolveAccParts(SemanticsContext &, const parser::ProgramUnit &); +void ResolveOmpParts(SemanticsContext &, const parser::ProgramUnit &); + +} // namespace Fortran::semantics +#endif diff --git a/flang/lib/Semantics/resolve-names-utils.h b/flang/lib/Semantics/resolve-names-utils.h index e04f58f1cc576..08db70345f152 100644 --- a/flang/lib/Semantics/resolve-names-utils.h +++ b/flang/lib/Semantics/resolve-names-utils.h @@ -11,8 +11,12 @@ // Utility functions and class for use in resolve-names.cpp. 
+#include "flang/Evaluate/fold.h" #include "flang/Parser/message.h" +#include "flang/Parser/tools.h" +#include "flang/Semantics/expression.h" #include "flang/Semantics/scope.h" +#include "flang/Semantics/semantics.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/type.h" #include @@ -48,6 +52,23 @@ bool IsDefinedOperator(const SourceName &); bool IsIntrinsicOperator(const SemanticsContext &, const SourceName &); bool IsLogicalConstant(const SemanticsContext &, const SourceName &); +template +MaybeIntExpr EvaluateIntExpr(SemanticsContext &context, const T &expr) { + if (MaybeExpr maybeExpr{ + Fold(context.foldingContext(), AnalyzeExpr(context, expr))}) { + if (auto *intExpr{evaluate::UnwrapExpr(*maybeExpr)}) { + return std::move(*intExpr); + } + } + return std::nullopt; +} + +template +std::optional EvaluateInt64( + SemanticsContext &context, const T &expr) { + return evaluate::ToInt64(EvaluateIntExpr(context, expr)); +} + // Analyze a generic-spec and generate a symbol name and GenericKind for it. class GenericSpecInfo { public: diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 7189b48482654..c5b42473d0113 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -7,10 +7,10 @@ #include "resolve-names.h" #include "assignment.h" -#include "check-omp-structure.h" #include "mod-file.h" #include "pointer-assignment.h" #include "program-tree.h" +#include "resolve-directives.h" #include "resolve-names-utils.h" #include "rewrite-parse-tree.h" #include "flang/Common/Fortran.h" @@ -156,6 +156,9 @@ class BaseVisitor { evaluate::FoldingContext &GetFoldingContext() const { return context_->foldingContext(); } + bool IsIntrinsic(const SourceName &name) const { + return context_->intrinsics().IsIntrinsic(name.ToString()); + } // Make a placeholder symbol for a Name that otherwise wouldn't have one. // It is not in any scope and always has MiscDetails. 
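[Editor's note: the EvaluateIntExpr/EvaluateInt64 helpers moved into resolve-names-utils.h above are what let the directive visitors fold COLLAPSE and ORDERED clause arguments to compile-time integers without reaching back into ResolveNamesVisitor. A rough standalone model of the two layers, with flang's expression analysis and folding replaced by illustrative stand-ins, is:

#include <cstdint>
#include <optional>
#include <string>

// Stand-in for flang's typed integer expression; MaybeIntExpr is richer.
struct IntExpr {
  std::optional<std::int64_t> constant; // present once folding succeeds
};

// Counterpart of EvaluateIntExpr: "analyze", then "fold". In this model,
// folding succeeds only for plain integer literals.
std::optional<IntExpr> EvaluateIntExpr(const std::string &source) {
  try {
    std::size_t used{0};
    std::int64_t value = std::stoll(source, &used);
    if (used == source.size()) {
      return IntExpr{value};
    }
  } catch (...) {
    // fall through: not a constant integer expression
  }
  return std::nullopt;
}

// Counterpart of EvaluateInt64: collapse both optional layers to a scalar.
std::optional<std::int64_t> EvaluateInt64(const std::string &source) {
  if (auto expr{EvaluateIntExpr(source)}) {
    return expr->constant;
  }
  return std::nullopt;
}

int main() {
  // The argument of COLLAPSE(2) folds to 2; a non-constant one yields nullopt.
  return EvaluateInt64("2").value_or(0) == 2 && !EvaluateInt64("n") ? 0 : 1;
}

In the real helpers, AnalyzeExpr builds the typed expression, Fold evaluates it in the semantics context, UnwrapExpr narrows it to an integer expression, and evaluate::ToInt64 extracts the scalar constant.]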
@@ -197,12 +200,7 @@ class BaseVisitor { } template MaybeIntExpr EvaluateIntExpr(const T &expr) { - if (MaybeExpr maybeExpr{EvaluateExpr(expr)}) { - if (auto *intExpr{evaluate::UnwrapExpr(*maybeExpr)}) { - return std::move(*intExpr); - } - } - return std::nullopt; + return semantics::EvaluateIntExpr(*context_, expr); } template @@ -845,7 +843,6 @@ class DeclarationVisitor : public ArraySpecVisitor, const parser::Name *ResolveStructureComponent( const parser::StructureComponent &); const parser::Name *ResolveDataRef(const parser::DataRef &); - const parser::Name *ResolveVariable(const parser::Variable &); const parser::Name *ResolveName(const parser::Name &); bool PassesSharedLocalityChecks(const parser::Name &name, Symbol &symbol); Symbol *NoteInterfaceName(const parser::Name &); @@ -912,6 +909,7 @@ class DeclarationVisitor : public ArraySpecVisitor, void AddSaveName(std::set &, const SourceName &); void SetSaveAttr(Symbol &); bool HandleUnrestrictedSpecificIntrinsicFunction(const parser::Name &); + bool IsUplevelReference(const Symbol &); const parser::Name *FindComponent(const parser::Name *, const parser::Name &); bool CheckInitialDataTarget(const Symbol &, const SomeExpr &, SourceName); void CheckInitialProcTarget(const Symbol &, const parser::Name &, SourceName); @@ -1081,6 +1079,71 @@ class ConstructVisitor : public virtual DeclarationVisitor { void PopAssociation(); }; +// Create scopes for OpenACC constructs +class AccVisitor : public virtual DeclarationVisitor { +public: + void AddAccSourceRange(const parser::CharBlock &); + + static bool NeedsScope(const parser::OpenACCBlockConstruct &); + + bool Pre(const parser::OpenACCBlockConstruct &); + void Post(const parser::OpenACCBlockConstruct &); + bool Pre(const parser::AccBeginBlockDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccBeginBlockDirective &) { + messageHandler().set_currStmtSource(std::nullopt); + } + bool Pre(const parser::AccEndBlockDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccEndBlockDirective &) { + messageHandler().set_currStmtSource(std::nullopt); + } + bool Pre(const parser::AccBeginLoopDirective &x) { + AddAccSourceRange(x.source); + return true; + } + void Post(const parser::AccBeginLoopDirective &x) { + messageHandler().set_currStmtSource(std::nullopt); + } +}; + +bool AccVisitor::NeedsScope(const parser::OpenACCBlockConstruct &x) { + const auto &beginBlockDir{std::get(x.t)}; + const auto &beginDir{std::get(beginBlockDir.t)}; + switch (beginDir.v) { + case llvm::acc::Directive::ACCD_data: + case llvm::acc::Directive::ACCD_host_data: + case llvm::acc::Directive::ACCD_kernels: + case llvm::acc::Directive::ACCD_parallel: + case llvm::acc::Directive::ACCD_serial: + return true; + default: + return false; + } +} + +void AccVisitor::AddAccSourceRange(const parser::CharBlock &source) { + messageHandler().set_currStmtSource(source); + currScope().AddSourceRange(source); +} + +bool AccVisitor::Pre(const parser::OpenACCBlockConstruct &x) { + if (NeedsScope(x)) { + PushScope(Scope::Kind::Block, nullptr); + } + return true; +} + +void AccVisitor::Post(const parser::OpenACCBlockConstruct &x) { + if (NeedsScope(x)) { + PopScope(); + } +} + // Create scopes for OpenMP constructs class OmpVisitor : public virtual DeclarationVisitor { public: @@ -1177,182 +1240,16 @@ void OmpVisitor::Post(const parser::OpenMPBlockConstruct &x) { } } -// Data-sharing and Data-mapping attributes for data-refs in OpenMP construct -class 
OmpAttributeVisitor { -public: - explicit OmpAttributeVisitor( - SemanticsContext &context, ResolveNamesVisitor &resolver) - : context_{context}, resolver_{resolver} {} - - template void Walk(const A &x) { parser::Walk(x, *this); } - - template bool Pre(const A &) { return true; } - template void Post(const A &) {} - - bool Pre(const parser::SpecificationPart &x) { - Walk(std::get>(x.t)); - return false; - } - - bool Pre(const parser::OpenMPBlockConstruct &); - void Post(const parser::OpenMPBlockConstruct &) { PopContext(); } - void Post(const parser::OmpBeginBlockDirective &) { - GetContext().withinConstruct = true; - } - - bool Pre(const parser::OpenMPLoopConstruct &); - void Post(const parser::OpenMPLoopConstruct &) { PopContext(); } - void Post(const parser::OmpBeginLoopDirective &) { - GetContext().withinConstruct = true; - } - bool Pre(const parser::DoConstruct &); - - bool Pre(const parser::OpenMPSectionsConstruct &); - void Post(const parser::OpenMPSectionsConstruct &) { PopContext(); } - - bool Pre(const parser::OpenMPThreadprivate &); - void Post(const parser::OpenMPThreadprivate &) { PopContext(); } - - // 2.15.3 Data-Sharing Attribute Clauses - void Post(const parser::OmpDefaultClause &); - bool Pre(const parser::OmpClause::Shared &x) { - ResolveOmpObjectList(x.v, Symbol::Flag::OmpShared); - return false; - } - bool Pre(const parser::OmpClause::Private &x) { - ResolveOmpObjectList(x.v, Symbol::Flag::OmpPrivate); - return false; - } - bool Pre(const parser::OmpClause::Firstprivate &x) { - ResolveOmpObjectList(x.v, Symbol::Flag::OmpFirstPrivate); - return false; - } - bool Pre(const parser::OmpClause::Lastprivate &x) { - ResolveOmpObjectList(x.v, Symbol::Flag::OmpLastPrivate); - return false; - } - - void Post(const parser::Name &); - -private: - struct OmpContext { - OmpContext( - const parser::CharBlock &source, llvm::omp::Directive d, Scope &s) - : directiveSource{source}, directive{d}, scope{s} {} - parser::CharBlock directiveSource; - llvm::omp::Directive directive; - Scope &scope; - // TODO: default DSA is implicitly determined in different ways - Symbol::Flag defaultDSA{Symbol::Flag::OmpShared}; - // variables on Data-sharing attribute clauses - std::map objectWithDSA; - bool withinConstruct{false}; - std::int64_t associatedLoopLevel{0}; - }; - // back() is the top of the stack - OmpContext &GetContext() { - CHECK(!ompContext_.empty()); - return ompContext_.back(); - } - void PushContext(const parser::CharBlock &source, llvm::omp::Directive dir) { - ompContext_.emplace_back(source, dir, context_.FindScope(source)); - } - void PopContext() { ompContext_.pop_back(); } - void SetContextDirectiveSource(parser::CharBlock &dir) { - GetContext().directiveSource = dir; - } - void SetContextDirectiveEnum(llvm::omp::Directive dir) { - GetContext().directive = dir; - } - Scope &currScope() { return GetContext().scope; } - void SetContextDefaultDSA(Symbol::Flag flag) { - GetContext().defaultDSA = flag; - } - void AddToContextObjectWithDSA( - const Symbol &symbol, Symbol::Flag flag, OmpContext &context) { - context.objectWithDSA.emplace(&symbol, flag); - } - void AddToContextObjectWithDSA(const Symbol &symbol, Symbol::Flag flag) { - AddToContextObjectWithDSA(symbol, flag, GetContext()); - } - bool IsObjectWithDSA(const Symbol &symbol) { - auto it{GetContext().objectWithDSA.find(&symbol)}; - return it != GetContext().objectWithDSA.end(); - } - - void SetContextAssociatedLoopLevel(std::int64_t level) { - GetContext().associatedLoopLevel = level; - } - std::int64_t 
GetAssociatedLoopLevelFromClauses(const parser::OmpClauseList &); - - Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev, Scope &scope) { - const auto pair{scope.try_emplace(name, Attrs{}, HostAssocDetails{prev})}; - return *pair.first->second; - } - Symbol &MakeAssocSymbol(const SourceName &name, Symbol &prev) { - return MakeAssocSymbol(name, prev, currScope()); - } - - static const parser::Name *GetDesignatorNameIfDataRef( - const parser::Designator &designator) { - const auto *dataRef{std::get_if(&designator.u)}; - return dataRef ? std::get_if(&dataRef->u) : nullptr; - } - - static constexpr Symbol::Flags dataSharingAttributeFlags{ - Symbol::Flag::OmpShared, Symbol::Flag::OmpPrivate, - Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, - Symbol::Flag::OmpReduction, Symbol::Flag::OmpLinear}; - - static constexpr Symbol::Flags ompFlagsRequireNewSymbol{ - Symbol::Flag::OmpPrivate, Symbol::Flag::OmpLinear, - Symbol::Flag::OmpFirstPrivate, Symbol::Flag::OmpLastPrivate, - Symbol::Flag::OmpReduction}; - - static constexpr Symbol::Flags ompFlagsRequireMark{ - Symbol::Flag::OmpThreadprivate}; - - void AddDataSharingAttributeObject(SymbolRef object) { - dataSharingAttributeObjects_.insert(object); - } - void ClearDataSharingAttributeObjects() { - dataSharingAttributeObjects_.clear(); - } - bool HasDataSharingAttributeObject(const Symbol &); - - const parser::DoConstruct *GetDoConstructIf( - const parser::ExecutionPartConstruct &); - // Predetermined DSA rules - void PrivatizeAssociatedLoopIndex(const parser::OpenMPLoopConstruct &); - const parser::Name &GetLoopIndex(const parser::DoConstruct &); - void ResolveSeqLoopIndexInParallelOrTaskConstruct(const parser::Name &); - - void ResolveOmpObjectList(const parser::OmpObjectList &, Symbol::Flag); - void ResolveOmpObject(const parser::OmpObject &, Symbol::Flag); - Symbol *ResolveOmp(const parser::Name &, Symbol::Flag, Scope &); - Symbol *ResolveOmp(Symbol &, Symbol::Flag, Scope &); - Symbol *ResolveOmpCommonBlockName(const parser::Name *); - Symbol *DeclarePrivateAccessEntity( - const parser::Name &, Symbol::Flag, Scope &); - Symbol *DeclarePrivateAccessEntity(Symbol &, Symbol::Flag, Scope &); - Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); - Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); - void CheckMultipleAppearances( - const parser::Name &, const Symbol &, Symbol::Flag); - SymbolSet dataSharingAttributeObjects_; // on one directive - - SemanticsContext &context_; - ResolveNamesVisitor &resolver_; - std::vector ompContext_; // used as a stack -}; - // Walk the parse tree and resolve names to symbols. 
class ResolveNamesVisitor : public virtual ScopeHandler, public ModuleVisitor, public SubprogramVisitor, public ConstructVisitor, - public OmpVisitor { + public OmpVisitor, + public AccVisitor { public: + using AccVisitor::Post; + using AccVisitor::Pre; using ArraySpecVisitor::Post; using ConstructVisitor::Post; using ConstructVisitor::Pre; @@ -1450,7 +1347,6 @@ class ResolveNamesVisitor : public virtual ScopeHandler, void FinishSpecificationParts(const ProgramTree &); void FinishDerivedTypeInstantiation(Scope &); void ResolveExecutionParts(const ProgramTree &); - void ResolveOmpParts(const parser::ProgramUnit &); }; // ImplicitRules implementation @@ -1651,7 +1547,7 @@ bool AttrsVisitor::IsConflictingAttr(Attr attrName) { return HaveAttrConflict(attrName, Attr::INTENT_IN, Attr::INTENT_INOUT) || HaveAttrConflict(attrName, Attr::INTENT_IN, Attr::INTENT_OUT) || HaveAttrConflict(attrName, Attr::INTENT_INOUT, Attr::INTENT_OUT) || - HaveAttrConflict(attrName, Attr::PASS, Attr::NOPASS) || + HaveAttrConflict(attrName, Attr::PASS, Attr::NOPASS) || // C781 HaveAttrConflict(attrName, Attr::PURE, Attr::IMPURE) || HaveAttrConflict(attrName, Attr::PUBLIC, Attr::PRIVATE) || HaveAttrConflict(attrName, Attr::RECURSIVE, Attr::NON_RECURSIVE); @@ -2015,7 +1911,8 @@ void ScopeHandler::Say2(const parser::Name &name, MessageFixedText &&msg1, Scope &ScopeHandler::InclusiveScope() { for (auto *scope{&currScope()};; scope = &scope->parent()) { - if (scope->kind() != Scope::Kind::Block && !scope->IsDerivedType()) { + if (scope->kind() != Scope::Kind::Block && !scope->IsDerivedType() && + !scope->IsStmtFunction()) { return *scope; } } @@ -2153,14 +2050,14 @@ static bool NeedsType(const Symbol &symbol) { }, symbol.details()); } + void ScopeHandler::ApplyImplicitRules(Symbol &symbol) { if (NeedsType(symbol)) { if (const DeclTypeSpec * type{GetImplicitType(symbol)}) { symbol.set(Symbol::Flag::Implicit); symbol.SetType(*type); } else if (symbol.has() && - !symbol.attrs().test(Attr::EXTERNAL) && - context().intrinsics().IsIntrinsic(symbol.name().ToString())) { + !symbol.attrs().test(Attr::EXTERNAL) && IsIntrinsic(symbol.name())) { // type will be determined in expression semantics symbol.attrs().set(Attr::INTRINSIC); } else if (!context().HasError(symbol)) { @@ -2169,6 +2066,7 @@ void ScopeHandler::ApplyImplicitRules(Symbol &symbol) { } } } + const DeclTypeSpec *ScopeHandler::GetImplicitType(Symbol &symbol) { const DeclTypeSpec *type{implicitRules().GetType(symbol.name().begin()[0])}; if (type) { @@ -2692,6 +2590,7 @@ bool SubprogramVisitor::HandleStmtFunction(const parser::StmtFunctionStmt &x) { return true; } auto &symbol{PushSubprogramScope(name, Symbol::Flag::Function)}; + symbol.set(Symbol::Flag::StmtFunction); EraseSymbol(symbol); // removes symbol added by PushSubprogramScope auto &details{symbol.get()}; for (const auto &dummyName : std::get>(x.t)) { @@ -3285,8 +3184,7 @@ bool DeclarationVisitor::Pre(const parser::Enumerator &enumerator) { if (auto &init{std::get>( enumerator.t)}) { Walk(*init); // Resolve names in expression before evaluation. - MaybeIntExpr expr{EvaluateIntExpr(*init)}; - if (auto value{evaluate::ToInt64(expr)}) { + if (auto value{EvaluateInt64(context(), *init)}) { // Cast all init expressions to C_INT so that they can then be // safely incremented (see 7.6 Note 2). 
enumerationState_.value = static_cast(*value); @@ -3391,8 +3289,7 @@ bool DeclarationVisitor::HandleAttributeStmt( } Symbol &DeclarationVisitor::HandleAttributeStmt( Attr attr, const parser::Name &name) { - if (attr == Attr::INTRINSIC && - !context().intrinsics().IsIntrinsic(name.source.ToString())) { + if (attr == Attr::INTRINSIC && !IsIntrinsic(name.source)) { Say(name.source, "'%s' is not a known intrinsic procedure"_err_en_US); } auto *symbol{FindInScope(currScope(), name)}; @@ -5250,8 +5147,7 @@ void ConstructVisitor::Post(const parser::SelectRankCaseStmt::Rank &x) { SetTypeFromAssociation(*symbol); SetAttrsFromAssociation(*symbol); if (const auto *init{std::get_if(&x.u)}) { - MaybeIntExpr expr{EvaluateIntExpr(*init)}; - if (auto val{evaluate::ToInt64(expr)}) { + if (auto val{EvaluateInt64(context(), *init)}) { auto &details{symbol->get()}; details.set_rank(*val); } @@ -5527,28 +5423,6 @@ const parser::Name *DeclarationVisitor::ResolveDataRef( x.u); } -const parser::Name *DeclarationVisitor::ResolveVariable( - const parser::Variable &x) { - return std::visit( - common::visitors{ - [&](const Indirection &y) { - return ResolveDesignator(y.value()); - }, - [&](const Indirection &y) { - const auto &proc{ - std::get(y.value().v.t)}; - return std::visit(common::visitors{ - [&](const parser::Name &z) { return &z; }, - [&](const parser::ProcComponentRef &z) { - return ResolveStructureComponent(z.v.thing); - }, - }, - proc.u); - }, - }, - x.u); -} - // If implicit types are allowed, ensure name is in the symbol table. // Otherwise, report an error if it hasn't been declared. const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { @@ -5556,7 +5430,10 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { if (CheckUseError(name)) { return nullptr; // reported an error } - if (IsDummy(*symbol) || + if (IsUplevelReference(*symbol)) { + name.symbol = nullptr; + MakeSymbol(name, HostAssocDetails{*symbol}); + } else if (IsDummy(*symbol) || (!symbol->GetType() && FindCommonBlockContaining(*symbol))) { ConvertToObjectEntity(*symbol); ApplyImplicitRules(*symbol); @@ -5580,6 +5457,16 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) { return &name; } +bool DeclarationVisitor::IsUplevelReference(const Symbol &symbol) { + const Scope *symbolUnit{FindProgramUnitContaining(symbol)}; + if (symbolUnit == FindProgramUnitContaining(currScope())) { + return false; + } else { + Scope::Kind kind{DEREF(symbolUnit).kind()}; + return kind == Scope::Kind::Subprogram || kind == Scope::Kind::MainProgram; + } +} + // base is a part-ref of a derived type; find the named component in its type. // Also handles intrinsic type parameter inquiries (%kind, %len) and // COMPLEX component references (%re, %im). @@ -5793,22 +5680,25 @@ void DeclarationVisitor::NonPointerInitialization(const parser::Name &name, const parser::ConstantExpr &expr, bool inComponentDecl) { if (name.symbol) { Symbol &ultimate{name.symbol->GetUltimate()}; - if (IsPointer(ultimate)) { - Say(name, "'%s' is a pointer but is not initialized like one"_err_en_US); - } else if (auto *details{ultimate.detailsIf()}) { - CHECK(!details->init()); - Walk(expr); - // TODO: check C762 - all bounds and type parameters of component - // are colons or constant expressions if component is initialized - if (inComponentDecl) { - // Can't convert to type of component, which might not yet - // be known; that's done later during instantiation. 
- if (MaybeExpr value{EvaluateExpr(expr)}) { - details->set_init(std::move(*value)); + if (!context().HasError(ultimate)) { + if (IsPointer(ultimate)) { + Say(name, + "'%s' is a pointer but is not initialized like one"_err_en_US); + } else if (auto *details{ultimate.detailsIf()}) { + CHECK(!details->init()); + Walk(expr); + // TODO: check C762 - all bounds and type parameters of component + // are colons or constant expressions if component is initialized + if (inComponentDecl) { + // Can't convert to type of component, which might not yet + // be known; that's done later during instantiation. + if (MaybeExpr value{EvaluateExpr(expr)}) { + details->set_init(std::move(*value)); + } + } else if (MaybeExpr folded{EvaluateConvertedExpr( + ultimate, expr, expr.thing.value().source)}) { + details->set_init(std::move(*folded)); } - } else if (MaybeExpr folded{EvaluateConvertedExpr( - ultimate, expr, expr.thing.value().source)}) { - details->set_init(std::move(*folded)); } } } @@ -5830,7 +5720,7 @@ void ResolveNamesVisitor::HandleProcedureName( CHECK(flag == Symbol::Flag::Function || flag == Symbol::Flag::Subroutine); auto *symbol{FindSymbol(NonDerivedTypeScope(), name)}; if (!symbol) { - if (context().intrinsics().IsIntrinsic(name.source.ToString())) { + if (IsIntrinsic(name.source)) { symbol = &MakeSymbol(InclusiveScope(), name.source, Attrs{Attr::INTRINSIC}); } else { @@ -5859,7 +5749,12 @@ void ResolveNamesVisitor::HandleProcedureName( // error was reported } else { symbol = &Resolve(name, symbol)->GetUltimate(); - ConvertToProcEntity(*symbol); + if (ConvertToProcEntity(*symbol) && IsIntrinsic(symbol->name()) && + !IsDummy(*symbol)) { + symbol->attrs().set(Attr::INTRINSIC); + // 8.2(3): ignore type from intrinsic in type-declaration-stmt + symbol->get().set_interface(ProcInterface{}); + } if (!SetProcFlag(name, *symbol, flag)) { return; // reported error } @@ -6273,7 +6168,8 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { inExecutionPart_ = true; ResolveExecutionParts(root); inExecutionPart_ = false; - ResolveOmpParts(x); + ResolveAccParts(context(), x); + ResolveOmpParts(context(), x); return false; } @@ -6466,431 +6362,6 @@ class DeferredCheckVisitor { bool pushedScope_{false}; }; -bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) { - const auto &beginBlockDir{std::get(x.t)}; - const auto &beginDir{std::get(beginBlockDir.t)}; - switch (beginDir.v) { - case llvm::omp::Directive::OMPD_master: - case llvm::omp::Directive::OMPD_ordered: - case llvm::omp::Directive::OMPD_parallel: - case llvm::omp::Directive::OMPD_single: - case llvm::omp::Directive::OMPD_target: - case llvm::omp::Directive::OMPD_target_data: - case llvm::omp::Directive::OMPD_task: - case llvm::omp::Directive::OMPD_teams: - case llvm::omp::Directive::OMPD_workshare: - case llvm::omp::Directive::OMPD_parallel_workshare: - case llvm::omp::Directive::OMPD_target_teams: - case llvm::omp::Directive::OMPD_target_parallel: - PushContext(beginDir.source, beginDir.v); - break; - default: - // TODO others - break; - } - ClearDataSharingAttributeObjects(); - return true; -} - -bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { - const auto &beginLoopDir{std::get(x.t)}; - const auto &beginDir{std::get(beginLoopDir.t)}; - const auto &clauseList{std::get(beginLoopDir.t)}; - switch (beginDir.v) { - case llvm::omp::Directive::OMPD_distribute: - case llvm::omp::Directive::OMPD_distribute_parallel_do: - case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: - case 
llvm::omp::Directive::OMPD_distribute_simd: - case llvm::omp::Directive::OMPD_do: - case llvm::omp::Directive::OMPD_do_simd: - case llvm::omp::Directive::OMPD_parallel_do: - case llvm::omp::Directive::OMPD_parallel_do_simd: - case llvm::omp::Directive::OMPD_simd: - case llvm::omp::Directive::OMPD_target_parallel_do: - case llvm::omp::Directive::OMPD_target_parallel_do_simd: - case llvm::omp::Directive::OMPD_target_teams_distribute: - case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do: - case llvm::omp::Directive::OMPD_target_teams_distribute_parallel_do_simd: - case llvm::omp::Directive::OMPD_target_teams_distribute_simd: - case llvm::omp::Directive::OMPD_target_simd: - case llvm::omp::Directive::OMPD_taskloop: - case llvm::omp::Directive::OMPD_taskloop_simd: - case llvm::omp::Directive::OMPD_teams_distribute: - case llvm::omp::Directive::OMPD_teams_distribute_parallel_do: - case llvm::omp::Directive::OMPD_teams_distribute_parallel_do_simd: - case llvm::omp::Directive::OMPD_teams_distribute_simd: - PushContext(beginDir.source, beginDir.v); - break; - default: - break; - } - ClearDataSharingAttributeObjects(); - SetContextAssociatedLoopLevel(GetAssociatedLoopLevelFromClauses(clauseList)); - PrivatizeAssociatedLoopIndex(x); - return true; -} - -const parser::Name &OmpAttributeVisitor::GetLoopIndex( - const parser::DoConstruct &x) { - auto &loopControl{x.GetLoopControl().value()}; - using Bounds = parser::LoopControl::Bounds; - const Bounds &bounds{std::get(loopControl.u)}; - return bounds.name.thing; -} - -void OmpAttributeVisitor::ResolveSeqLoopIndexInParallelOrTaskConstruct( - const parser::Name &iv) { - auto targetIt{ompContext_.rbegin()}; - for (;; ++targetIt) { - if (targetIt == ompContext_.rend()) { - return; - } - if (llvm::omp::parallelSet.test(targetIt->directive) || - llvm::omp::taskGeneratingSet.test(targetIt->directive)) { - break; - } - } - if (auto *symbol{ResolveOmp(iv, Symbol::Flag::OmpPrivate, targetIt->scope)}) { - targetIt++; - symbol->set(Symbol::Flag::OmpPreDetermined); - iv.symbol = symbol; // adjust the symbol within region - for (auto it{ompContext_.rbegin()}; it != targetIt; ++it) { - AddToContextObjectWithDSA(*symbol, Symbol::Flag::OmpPrivate, *it); - } - } -} - -// 2.15.1.1 Data-sharing Attribute Rules - Predetermined -// - A loop iteration variable for a sequential loop in a parallel -// or task generating construct is private in the innermost such -// construct that encloses the loop -bool OmpAttributeVisitor::Pre(const parser::DoConstruct &x) { - if (!ompContext_.empty() && GetContext().withinConstruct) { - if (const auto &iv{GetLoopIndex(x)}; iv.symbol) { - if (!iv.symbol->test(Symbol::Flag::OmpPreDetermined)) { - ResolveSeqLoopIndexInParallelOrTaskConstruct(iv); - } else { - // TODO: conflict checks with explicitly determined DSA - } - } - } - return true; -} - -const parser::DoConstruct *OmpAttributeVisitor::GetDoConstructIf( - const parser::ExecutionPartConstruct &x) { - if (auto *y{std::get_if(&x.u)}) { - if (auto *z{std::get_if>(&y->u)}) { - return &z->value(); - } - } - return nullptr; -} - -std::int64_t OmpAttributeVisitor::GetAssociatedLoopLevelFromClauses( - const parser::OmpClauseList &x) { - std::int64_t orderedLevel{0}; - std::int64_t collapseLevel{0}; - for (const auto &clause : x.v) { - if (const auto *orderedClause{ - std::get_if(&clause.u)}) { - if (const auto v{ - evaluate::ToInt64(resolver_.EvaluateIntExpr(orderedClause->v))}) { - orderedLevel = *v; - } - } - if (const auto *collapseClause{ - std::get_if(&clause.u)}) { - if 
(const auto v{evaluate::ToInt64( - resolver_.EvaluateIntExpr(collapseClause->v))}) { - collapseLevel = *v; - } - } - } - - if (orderedLevel && (!collapseLevel || orderedLevel >= collapseLevel)) { - return orderedLevel; - } else if (!orderedLevel && collapseLevel) { - return collapseLevel; - } // orderedLevel < collapseLevel is an error handled in structural checks - return 1; // default is outermost loop -} - -// 2.15.1.1 Data-sharing Attribute Rules - Predetermined -// - The loop iteration variable(s) in the associated do-loop(s) of a do, -// parallel do, taskloop, or distribute construct is (are) private. -// - The loop iteration variable in the associated do-loop of a simd construct -// with just one associated do-loop is linear with a linear-step that is the -// increment of the associated do-loop. -// - The loop iteration variables in the associated do-loops of a simd -// construct with multiple associated do-loops are lastprivate. -// -// TODO: revisit after semantics checks are completed for do-loop association of -// collapse and ordered -void OmpAttributeVisitor::PrivatizeAssociatedLoopIndex( - const parser::OpenMPLoopConstruct &x) { - std::int64_t level{GetContext().associatedLoopLevel}; - if (level <= 0) - return; - Symbol::Flag ivDSA{Symbol::Flag::OmpPrivate}; - if (llvm::omp::simdSet.test(GetContext().directive)) { - if (level == 1) { - ivDSA = Symbol::Flag::OmpLinear; - } else { - ivDSA = Symbol::Flag::OmpLastPrivate; - } - } - - auto &outer{std::get>(x.t)}; - for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) { - // go through all the nested do-loops and resolve index variables - const parser::Name &iv{GetLoopIndex(*loop)}; - if (auto *symbol{ResolveOmp(iv, ivDSA, currScope())}) { - symbol->set(Symbol::Flag::OmpPreDetermined); - iv.symbol = symbol; // adjust the symbol within region - AddToContextObjectWithDSA(*symbol, ivDSA); - } - - const auto &block{std::get(loop->t)}; - const auto it{block.begin()}; - loop = it != block.end() ? 
GetDoConstructIf(*it) : nullptr; - } - CHECK(level == 0); -} - -bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) { - const auto &beginSectionsDir{ - std::get(x.t)}; - const auto &beginDir{ - std::get(beginSectionsDir.t)}; - switch (beginDir.v) { - case llvm::omp::Directive::OMPD_parallel_sections: - case llvm::omp::Directive::OMPD_sections: - PushContext(beginDir.source, beginDir.v); - break; - default: - break; - } - ClearDataSharingAttributeObjects(); - return true; -} - -bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_threadprivate); - const auto &list{std::get(x.t)}; - ResolveOmpObjectList(list, Symbol::Flag::OmpThreadprivate); - return false; -} - -void OmpAttributeVisitor::Post(const parser::OmpDefaultClause &x) { - if (!ompContext_.empty()) { - switch (x.v) { - case parser::OmpDefaultClause::Type::Private: - SetContextDefaultDSA(Symbol::Flag::OmpPrivate); - break; - case parser::OmpDefaultClause::Type::Firstprivate: - SetContextDefaultDSA(Symbol::Flag::OmpFirstPrivate); - break; - case parser::OmpDefaultClause::Type::Shared: - SetContextDefaultDSA(Symbol::Flag::OmpShared); - break; - case parser::OmpDefaultClause::Type::None: - SetContextDefaultDSA(Symbol::Flag::OmpNone); - break; - } - } -} - -// For OpenMP constructs, check all the data-refs within the constructs -// and adjust the symbol for each Name if necessary -void OmpAttributeVisitor::Post(const parser::Name &name) { - auto *symbol{name.symbol}; - if (symbol && !ompContext_.empty() && GetContext().withinConstruct) { - if (!symbol->owner().IsDerivedType() && !symbol->has() && - !IsObjectWithDSA(*symbol)) { - // TODO: create a separate function to go through the rules for - // predetermined, explicitly determined, and implicitly - // determined data-sharing attributes (2.15.1.1). - if (Symbol * found{currScope().FindSymbol(name.source)}) { - if (symbol != found) { - name.symbol = found; // adjust the symbol within region - } else if (GetContext().defaultDSA == Symbol::Flag::OmpNone) { - context_.Say(name.source, - "The DEFAULT(NONE) clause requires that '%s' must be listed in " - "a data-sharing attribute clause"_err_en_US, - symbol->name()); - } - } - } - } // within OpenMP construct -} - -bool OmpAttributeVisitor::HasDataSharingAttributeObject(const Symbol &object) { - auto it{dataSharingAttributeObjects_.find(object)}; - return it != dataSharingAttributeObjects_.end(); -} - -Symbol *OmpAttributeVisitor::ResolveOmpCommonBlockName( - const parser::Name *name) { - if (auto *prev{name - ? 
GetContext().scope.parent().FindCommonBlock(name->source) - : nullptr}) { - name->symbol = prev; - return prev; - } else { - return nullptr; - } -} - -void OmpAttributeVisitor::ResolveOmpObjectList( - const parser::OmpObjectList &ompObjectList, Symbol::Flag ompFlag) { - for (const auto &ompObject : ompObjectList.v) { - ResolveOmpObject(ompObject, ompFlag); - } -} - -void OmpAttributeVisitor::ResolveOmpObject( - const parser::OmpObject &ompObject, Symbol::Flag ompFlag) { - std::visit( - common::visitors{ - [&](const parser::Designator &designator) { - if (const auto *name{GetDesignatorNameIfDataRef(designator)}) { - if (auto *symbol{ResolveOmp(*name, ompFlag, currScope())}) { - AddToContextObjectWithDSA(*symbol, ompFlag); - if (dataSharingAttributeFlags.test(ompFlag)) { - CheckMultipleAppearances(*name, *symbol, ompFlag); - } - } - } else if (const auto *designatorName{ - resolver_.ResolveDesignator(designator)}; - designatorName->symbol) { - // Array sections to be changed to substrings as needed - if (AnalyzeExpr(context_, designator)) { - if (std::holds_alternative(designator.u)) { - context_.Say(designator.source, - "Substrings are not allowed on OpenMP " - "directives or clauses"_err_en_US); - } - } - // other checks, more TBD - if (const auto *details{designatorName->symbol - ->detailsIf()}) { - if (details->IsArray()) { - // TODO: check Array Sections - } else if (designatorName->symbol->owner().IsDerivedType()) { - // TODO: check Structure Component - } - } - } - }, - [&](const parser::Name &name) { // common block - if (auto *symbol{ResolveOmpCommonBlockName(&name)}) { - CheckMultipleAppearances( - name, *symbol, Symbol::Flag::OmpCommonBlock); - // 2.15.3 When a named common block appears in a list, it has the - // same meaning as if every explicit member of the common block - // appeared in the list - for (auto &object : symbol->get().objects()) { - if (auto *resolvedObject{ - ResolveOmp(*object, ompFlag, currScope())}) { - AddToContextObjectWithDSA(*resolvedObject, ompFlag); - } - } - } else { - context_.Say(name.source, // 2.15.3 - "COMMON block must be declared in the same scoping unit " - "in which the OpenMP directive or clause appears"_err_en_US); - } - }, - }, - ompObject.u); -} - -Symbol *OmpAttributeVisitor::ResolveOmp( - const parser::Name &name, Symbol::Flag ompFlag, Scope &scope) { - if (ompFlagsRequireNewSymbol.test(ompFlag)) { - return DeclarePrivateAccessEntity(name, ompFlag, scope); - } else { - return DeclareOrMarkOtherAccessEntity(name, ompFlag); - } -} - -Symbol *OmpAttributeVisitor::ResolveOmp( - Symbol &symbol, Symbol::Flag ompFlag, Scope &scope) { - if (ompFlagsRequireNewSymbol.test(ompFlag)) { - return DeclarePrivateAccessEntity(symbol, ompFlag, scope); - } else { - return DeclareOrMarkOtherAccessEntity(symbol, ompFlag); - } -} - -Symbol *OmpAttributeVisitor::DeclarePrivateAccessEntity( - const parser::Name &name, Symbol::Flag ompFlag, Scope &scope) { - if (!name.symbol) { - return nullptr; // not resolved by Name Resolution step, do nothing - } - name.symbol = DeclarePrivateAccessEntity(*name.symbol, ompFlag, scope); - return name.symbol; -} - -Symbol *OmpAttributeVisitor::DeclarePrivateAccessEntity( - Symbol &object, Symbol::Flag ompFlag, Scope &scope) { - if (object.owner() != currScope()) { - auto &symbol{MakeAssocSymbol(object.name(), object, scope)}; - symbol.set(ompFlag); - return &symbol; - } else { - object.set(ompFlag); - return &object; - } -} - -Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( - const parser::Name &name, Symbol::Flag 
ompFlag) { - Symbol *prev{currScope().FindSymbol(name.source)}; - if (!name.symbol || !prev) { - return nullptr; - } else if (prev != name.symbol) { - name.symbol = prev; - } - return DeclareOrMarkOtherAccessEntity(*prev, ompFlag); -} - -Symbol *OmpAttributeVisitor::DeclareOrMarkOtherAccessEntity( - Symbol &object, Symbol::Flag ompFlag) { - if (ompFlagsRequireMark.test(ompFlag)) { - object.set(ompFlag); - } - return &object; -} - -static bool WithMultipleAppearancesException( - const Symbol &symbol, Symbol::Flag ompFlag) { - return (ompFlag == Symbol::Flag::OmpFirstPrivate && - symbol.test(Symbol::Flag::OmpLastPrivate)) || - (ompFlag == Symbol::Flag::OmpLastPrivate && - symbol.test(Symbol::Flag::OmpFirstPrivate)); -} - -void OmpAttributeVisitor::CheckMultipleAppearances( - const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) { - const auto *target{&symbol}; - if (ompFlagsRequireNewSymbol.test(ompFlag)) { - if (const auto *details{symbol.detailsIf()}) { - target = &details->symbol(); - } - } - if (HasDataSharingAttributeObject(*target) && - !WithMultipleAppearancesException(symbol, ompFlag)) { - context_.Say(name.source, - "'%s' appears in more than one data-sharing clause " - "on the same OpenMP directive"_err_en_US, - name.ToString()); - } else { - AddDataSharingAttributeObject(*target); - } -} - // Perform checks and completions that need to happen after all of // the specification parts but before any of the execution parts. void ResolveNamesVisitor::FinishSpecificationParts(const ProgramTree &node) { @@ -6960,18 +6431,6 @@ void ResolveNamesVisitor::ResolveExecutionParts(const ProgramTree &node) { } } -void ResolveNamesVisitor::ResolveOmpParts(const parser::ProgramUnit &node) { - OmpAttributeVisitor{context(), *this}.Walk(node); - if (!context().AnyFatalError()) { - // The data-sharing attribute of the loop iteration variable for a - // sequential loop (2.15.1.1) can only be determined when visiting - // the corresponding DoConstruct, a second walk is to adjust the - // symbols for all the data-refs of that loop iteration variable - // prior to the DoConstruct. 
- OmpAttributeVisitor{context(), *this}.Walk(node); - } -} - void ResolveNamesVisitor::Post(const parser::Program &) { // ensure that all temps were deallocated CHECK(!attrs_); @@ -7003,4 +6462,5 @@ void ResolveSpecificationParts( visitor.ResolveSpecificationParts(node); context.set_location(std::move(originalLocation)); } + } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp index 02637ba2add7f..a2a9e1dbe9e73 100644 --- a/flang/lib/Semantics/scope.cpp +++ b/flang/lib/Semantics/scope.cpp @@ -333,6 +333,10 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Scope &scope) { return os; } +bool Scope::IsStmtFunction() const { + return symbol_ && symbol_->test(Symbol::Flag::StmtFunction); +} + bool Scope::IsParameterizedDerivedType() const { if (!IsDerivedType()) { return false; diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 2607011333964..d5ef9c76aa34c 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -179,10 +179,21 @@ bool DoesScopeContain(const Scope *maybeAncestor, const Symbol &symbol) { return DoesScopeContain(maybeAncestor, symbol.owner()); } +static const Symbol &FollowHostAssoc(const Symbol &symbol) { + for (const Symbol *s{&symbol};;) { + const auto *details{s->detailsIf()}; + if (!details) { + return *s; + } + s = &details->symbol(); + } +} + bool IsHostAssociated(const Symbol &symbol, const Scope &scope) { const Scope *subprogram{FindProgramUnitContaining(scope)}; return subprogram && - DoesScopeContain(FindProgramUnitContaining(symbol), *subprogram); + DoesScopeContain( + FindProgramUnitContaining(FollowHostAssoc(symbol)), *subprogram); } bool IsInStmtFunction(const Symbol &symbol) { diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp index 44ceb9fe08bb6..67016e85777c7 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -35,6 +35,11 @@ class SymbolDumpVisitor { template void Post(const parser::Statement &) { currStmt_ = std::nullopt; } + bool Pre(const parser::AccClause &clause) { + currStmt_ = clause.source; + return true; + } + void Post(const parser::AccClause &) { currStmt_ = std::nullopt; } bool Pre(const parser::OmpClause &clause) { currStmt_ = clause.source; return true; diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index ebbe41b49b2ce..998edc954ba75 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -13,6 +13,8 @@ namespace Fortran::runtime::io { +// For fixed-width fields, initialize the number of remaining characters. +// Skip over leading blanks, then return the first non-blank character (if any). static std::optional PrepareInput( IoStatementState &io, const DataEdit &edit, std::optional &remaining) { remaining.reset(); @@ -61,7 +63,8 @@ static bool EditBOZInput(IoStatementState &io, const DataEdit &edit, void *n, return true; } -// Returns false if there's a '-' sign +// Prepares input from a field, and consumes the sign, if any. +// Returns true if there's a '-' sign. 
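+// (An illustrative note, not from the original change: given the field
+// ' -123', PrepareInput() skips the leading blank, the '-' is consumed and
+// reported through the result, and the caller resumes scanning at '1'.)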
static bool ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
    std::optional<char32_t> &next, std::optional<int> &remaining) {
  next = PrepareInput(io, edit, remaining);
@@ -69,6 +72,7 @@ static bool ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
   if (next) {
     negative = *next == '-';
     if (negative || *next == '+') {
+      io.SkipSpaces(remaining);
       next = io.NextInField(remaining);
     }
   }
@@ -126,39 +130,44 @@ bool EditIntegerInput(
   return true;
 }
 
+// Parses a REAL input number from the input source as a normalized
+// fraction into a supplied buffer -- there's an optional '-', a
+// decimal point, and at least one digit. The adjusted exponent value
+// is returned in a reference argument. The returned value is the number
+// of characters that (should) have been written to the buffer -- this can
+// be larger than the buffer size and can indicate overflow. Replaces
+// blanks with zeroes if appropriate.
 static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io,
     const DataEdit &edit, int &exponent) {
   std::optional<int> remaining;
   std::optional<char32_t> next;
   int got{0};
   std::optional<int> decimalPoint;
-  if (ScanNumericPrefix(io, edit, next, remaining) && next) {
+  auto Put{[&](char ch) -> void {
     if (got < bufferSize) {
-      buffer[got++] = '-';
+      buffer[got] = ch;
     }
+    ++got;
+  }};
+  if (ScanNumericPrefix(io, edit, next, remaining)) {
+    Put('-');
   }
   if (!next) { // empty field means zero
-    if (got < bufferSize) {
-      buffer[got++] = '0';
-    }
+    Put('0');
     return got;
   }
-  if (got < bufferSize) {
-    buffer[got++] = '.'; // input field is normalized to a fraction
-  }
   char32_t decimal = edit.modes.editingFlags & decimalComma ? ',' : '.';
-  auto start{got};
-  if ((*next >= 'a' && *next <= 'z') || (*next >= 'A' && *next <= 'Z')) {
+  char32_t first{*next >= 'a' && *next <= 'z' ? *next + 'A' - 'a' : *next};
+  if (first == 'N' || first == 'I') { // NaN or infinity - convert to upper case
+    // Subtle: a blank field of digits could be followed by 'E' or 'D',
     for (; next &&
          ((*next >= 'a' && *next <= 'z') || (*next >= 'A' && *next <= 'Z'));
          next = io.NextInField(remaining)) {
-      if (got < bufferSize) {
-        if (*next >= 'a' && *next <= 'z') {
-          buffer[got++] = *next - 'a' + 'A';
-        } else {
-          buffer[got++] = *next;
-        }
+      if (*next >= 'a' && *next <= 'z') {
+        Put(*next - 'a' + 'A');
+      } else {
+        Put(*next);
       }
     }
     if (next && *next == '(') { // NaN(...)
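The normalization performed by these hunks is easiest to see in isolation. The
following standalone sketch (hypothetical names, not the runtime's code)
mirrors what ScanRealInput's buffer-filling does for an ordinary field: the
sign is copied, the digits are copied after a leading '.', the decimal point
itself is skipped, and the count of digits seen before it becomes the decimal
exponent:

    #include <cctype>
    #include <iostream>
    #include <string>

    // Hypothetical miniature of ScanRealInput's buffer normalization:
    // turn "[-]ddd.ddd" into "[-].dddddd" plus a decimal exponent.
    static std::string NormalizeReal(const char *field, int &exponent) {
      std::string out;
      const char *p{field};
      if (*p == '-' || *p == '+') {
        if (*p == '-') {
          out.push_back('-');
        }
        ++p;
      }
      out.push_back('.'); // the input field is normalized to a fraction
      int digitsBeforePoint{0};
      bool sawPoint{false};
      for (; *p; ++p) {
        if (std::isdigit(static_cast<unsigned char>(*p))) {
          out.push_back(*p); // digits are copied; the point itself is not
          if (!sawPoint) {
            ++digitsBeforePoint;
          }
        } else if (*p == '.' && !sawPoint) {
          sawPoint = true;
        } else {
          break;
        }
      }
      exponent = digitsBeforePoint; // ".ddd" times 10**exponent == input
      return out;
    }

    int main() {
      int exponent{};
      std::cout << NormalizeReal("-12.5", exponent) << " * 10^" << exponent
                << '\n'; // prints "-.125 * 10^2"
    }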
@@ -167,7 +176,10 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io, } } exponent = 0; - } else if (*next == decimal || (*next >= '0' && *next <= '9')) { + } else if (first == decimal || (first >= '0' && first <= '9') || + first == 'E' || first == 'D' || first == 'Q') { + Put('.'); // input field is normalized to a fraction + auto start{got}; for (; next; next = io.NextInField(remaining)) { char32_t ch{*next}; if (ch == ' ' || ch == '\t') { @@ -180,9 +192,7 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io, if (ch == '0' && got == start && !decimalPoint) { // omit leading zeroes before the decimal } else if (ch >= '0' && ch <= '9') { - if (got < bufferSize) { - buffer[got++] = ch; - } + Put(ch); } else if (ch == decimal && !decimalPoint) { // the decimal point is *not* copied to the buffer decimalPoint = got - start; // # of digits before the decimal point @@ -190,8 +200,8 @@ static int ScanRealInput(char *buffer, int bufferSize, IoStatementState &io, break; } } - if (got == start && got < bufferSize) { - buffer[got++] = '0'; // all digits were zeroes + if (got == start) { + Put('0'); // emit at least one digit } if (next && (*next == 'e' || *next == 'E' || *next == 'd' || *next == 'D' || diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp index 341702df995b8..6823b19e57912 100644 --- a/flang/runtime/file.cpp +++ b/flang/runtime/file.cpp @@ -18,6 +18,7 @@ #include #include #else +#include #include #endif @@ -84,8 +85,7 @@ void OpenFile::Open(OpenStatus status, std::optional action, fd_ = openfile_mkstemp(handler); } else { if (!path_.get()) { - handler.SignalError( - "FILE= is required unless STATUS='OLD' and unit is connected"); + handler.SignalError("FILE= is required"); return; } int flags{0}; @@ -134,8 +134,18 @@ void OpenFile::Open(OpenStatus status, std::optional action, mayWrite_ = *action != Action::Read; if (status == OpenStatus::Old || status == OpenStatus::Unknown) { knownSize_.reset(); +#ifndef _WIN32 + struct stat buf; + if (::fstat(fd_, &buf) == 0) { + mayPosition_ = S_ISREG(buf.st_mode); + knownSize_ = buf.st_size; + } +#else // TODO: _WIN32 + mayPosition_ = true; +#endif } else { knownSize_ = 0; + mayPosition_ = true; } } @@ -385,4 +395,11 @@ int OpenFile::PendingResult(const Terminator &terminator, int iostat) { } bool IsATerminal(int fd) { return ::isatty(fd); } + +bool IsExtant(const char *path) { return ::access(path, F_OK) == 0; } +bool MayRead(const char *path) { return ::access(path, R_OK) == 0; } +bool MayWrite(const char *path) { return ::access(path, W_OK) == 0; } +bool MayReadAndWrite(const char *path) { + return ::access(path, R_OK | W_OK) == 0; +} } // namespace Fortran::runtime::io diff --git a/flang/runtime/file.h b/flang/runtime/file.h index 1d25a91558a4c..7e7b27c4be2a4 100644 --- a/flang/runtime/file.h +++ b/flang/runtime/file.h @@ -95,5 +95,9 @@ class OpenFile { }; bool IsATerminal(int fd); +bool IsExtant(const char *path); +bool MayRead(const char *path); +bool MayWrite(const char *path); +bool MayReadAndWrite(const char *path); } // namespace Fortran::runtime::io #endif // FORTRAN_RUNTIME_FILE_H_ diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h index a4453cd172ea5..ad8bbcbcdcbdf 100644 --- a/flang/runtime/format-implementation.h +++ b/flang/runtime/format-implementation.h @@ -330,11 +330,13 @@ int FormatControl::CueUpNextDataEdit(Context &context, bool stop) { offset_ += *repeat; } else if (ch >= 'A' && ch <= 'Z') { int start{offset_ - 1}; - 
CharType next{Capitalize(PeekNext())}; - if (next >= 'A' && next <= 'Z') { - ++offset_; - } else { - next = '\0'; + CharType next{'\0'}; + if (ch != 'P') { // 1PE5.2 - comma not required (C1302) + CharType peek{Capitalize(PeekNext())}; + if (peek >= 'A' && peek <= 'Z') { + next = peek; + ++offset_; + } } if (ch == 'E' || (!next && diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp index 708090ac68561..f64fe97b2d233 100644 --- a/flang/runtime/io-api.cpp +++ b/flang/runtime/io-api.cpp @@ -23,6 +23,23 @@ namespace Fortran::runtime::io { +const char *InquiryKeywordHashDecode( + char *buffer, std::size_t n, InquiryKeywordHash hash) { + if (n < 1) { + return nullptr; + } + char *p{buffer + n}; + *--p = '\0'; + while (hash > 1) { + if (p < buffer) { + return nullptr; + } + *--p = 'A' + (hash % 26); + hash /= 26; + } + return hash == 1 ? p : nullptr; +} + template Cookie BeginInternalArrayListIO(const Descriptor &descriptor, void ** /*scratchArea*/, std::size_t /*scratchBytes*/, @@ -149,9 +166,6 @@ Cookie BeginExternalListIO( unit.SetDirection(DIR, handler); IoStatementState &io{unit.BeginIoStatement>( unit, sourceFile, sourceLine)}; - if constexpr (DIR == Direction::Input) { - unit.BeginReadingRecord(handler); - } return &io; } @@ -185,9 +199,6 @@ Cookie BeginExternalFormattedIO(const char *format, std::size_t formatLength, IoStatementState &io{ unit.BeginIoStatement>( unit, format, formatLength, sourceFile, sourceLine)}; - if constexpr (DIR == Direction::Input) { - unit.BeginReadingRecord(handler); - } return &io; } @@ -218,9 +229,7 @@ Cookie BeginUnformattedIO( unit, sourceFile, sourceLine)}; IoErrorHandler handler{terminator}; unit.SetDirection(DIR, handler); - if constexpr (DIR == Direction::Input) { - unit.BeginReadingRecord(handler); - } else { + if constexpr (DIR == Direction::Output) { if (unit.access == Access::Sequential && !unit.isFixedRecordLength) { // Create space for (sub)record header to be completed by // UnformattedIoStatementState::EndIoStatement() @@ -297,8 +306,8 @@ Cookie IONAME(BeginBackspace)( Cookie IONAME(BeginEndfile)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - ExternalFileUnit &unit{ - ExternalFileUnit::LookUpOrCrash(unitNumber, terminator)}; + ExternalFileUnit &unit{ExternalFileUnit::LookUpOrCreateAnonymous( + unitNumber, Direction::Output, true /*formatted*/, terminator)}; return &unit.BeginIoStatement( unit, ExternalMiscIoStatementState::Endfile, sourceFile, sourceLine); } @@ -306,12 +315,50 @@ Cookie IONAME(BeginEndfile)( Cookie IONAME(BeginRewind)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - ExternalFileUnit &unit{ - ExternalFileUnit::LookUpOrCrash(unitNumber, terminator)}; + ExternalFileUnit &unit{ExternalFileUnit::LookUpOrCreateAnonymous( + unitNumber, Direction::Input, true /*formatted*/, terminator)}; return &unit.BeginIoStatement( unit, ExternalMiscIoStatementState::Rewind, sourceFile, sourceLine); } +Cookie IONAME(BeginInquireUnit)( + ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { + if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { + return &unit->BeginIoStatement( + *unit, sourceFile, sourceLine); + } else { + // INQUIRE(UNIT=unrecognized unit) + Terminator oom{sourceFile, sourceLine}; + return &New{oom}(sourceFile, sourceLine) + .release() + ->ioStatementState(); + } +} + +Cookie IONAME(BeginInquireFile)(const char *path, std::size_t pathLength, + const char 
*sourceFile, int sourceLine) {
+  Terminator oom{sourceFile, sourceLine};
+  auto trimmed{
+      SaveDefaultCharacter(path, TrimTrailingSpaces(path, pathLength), oom)};
+  if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(trimmed.get())}) {
+    // INQUIRE(FILE=) to a connected unit
+    return &unit->BeginIoStatement<InquireUnitState>(
+        *unit, sourceFile, sourceLine);
+  } else {
+    return &New<InquireUnconnectedFileState>{oom}(
+                std::move(trimmed), sourceFile, sourceLine)
+                .release()
+                ->ioStatementState();
+  }
+}
+
+Cookie IONAME(BeginInquireIoLength)(const char *sourceFile, int sourceLine) {
+  Terminator oom{sourceFile, sourceLine};
+  return &New<InquireIOLengthState>{oom}(sourceFile, sourceLine)
+              .release()
+              ->ioStatementState();
+}
+
 // Control list items
 
 void IONAME(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr,
@@ -530,29 +577,21 @@ bool IONAME(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) {
     io.GetIoErrorHandler().Crash(
         "SetAccess() called when not in an OPEN statement");
   }
-  ConnectionState &connection{open->GetConnectionState()};
-  Access access{connection.access};
   static const char *keywords[]{"SEQUENTIAL", "DIRECT", "STREAM", nullptr};
   switch (IdentifyValue(keyword, length, keywords)) {
   case 0:
-    access = Access::Sequential;
+    open->set_access(Access::Sequential);
     break;
   case 1:
-    access = Access::Direct;
+    open->set_access(Access::Direct);
     break;
   case 2:
-    access = Access::Stream;
+    open->set_access(Access::Stream);
     break;
   default:
     open->SignalError(IostatErrorInKeyword, "Invalid ACCESS='%.*s'",
        static_cast<int>(length), keyword);
   }
-  if (access != connection.access) {
-    if (open->wasExtant()) {
-      open->SignalError("ACCESS= may not be changed on an open unit");
-    }
-    connection.access = access;
-  }
   return true;
 }
 
@@ -669,25 +708,18 @@ bool IONAME(SetForm)(Cookie cookie, const char *keyword, std::size_t length) {
     io.GetIoErrorHandler().Crash(
         "SetEncoding() called when not in an OPEN statement");
   }
-  bool isUnformatted{false};
   static const char *keywords[]{"FORMATTED", "UNFORMATTED", nullptr};
   switch (IdentifyValue(keyword, length, keywords)) {
   case 0:
-    isUnformatted = false;
+    open->set_isUnformatted(false);
     break;
   case 1:
-    isUnformatted = true;
+    open->set_isUnformatted(true);
     break;
   default:
     open->SignalError(IostatErrorInKeyword, "Invalid FORM='%.*s'",
        static_cast<int>(length), keyword);
   }
-  if (isUnformatted != open->unit().isUnformatted) {
-    if (open->wasExtant()) {
-      open->SignalError("FORM= may not be changed on an open unit");
-    }
-    open->unit().isUnformatted = isUnformatted;
-  }
   return true;
 }
 
@@ -785,11 +817,10 @@ bool IONAME(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) {
       "SetStatus() called when not in an OPEN or CLOSE statement");
 }
 
-bool IONAME(SetFile)(
-    Cookie cookie, const char *path, std::size_t chars, int kind) {
+bool IONAME(SetFile)(Cookie cookie, const char *path, std::size_t chars) {
   IoStatementState &io{*cookie};
   if (auto *open{io.get_if<OpenStatementState>()}) {
-    open->set_path(path, chars, kind);
+    open->set_path(path, chars);
     return true;
   }
   io.GetIoErrorHandler().Crash(
@@ -797,7 +828,8 @@ bool IONAME(SetFile)(
   return false;
 }
 
-static bool SetInteger(int &x, int kind, int value) {
+template <typename INT>
+static bool SetInteger(INT &x, int kind, std::int64_t value) {
   switch (kind) {
   case 1:
     reinterpret_cast<std::int8_t &>(x) = value;
@@ -806,7 +838,7 @@ static bool SetInteger(int &x, int kind, int value) {
     reinterpret_cast<std::int16_t &>(x) = value;
     return true;
   case 4:
-    x = value;
+    reinterpret_cast<std::int32_t &>(x) = value;
     return true;
   case 8:
     reinterpret_cast<std::int64_t &>(x) = value;
     return true;
@@ -838,6 +870,7 @@ bool IONAME(OutputDescriptor)(Cookie cookie, const
Descriptor &) { bool IONAME(InputDescriptor)(Cookie cookie, const Descriptor &) { IoStatementState &io{*cookie}; + io.BeginReadingRecord(); io.GetIoErrorHandler().Crash("InputDescriptor: not yet implemented"); // TODO } @@ -855,6 +888,7 @@ bool IONAME(OutputUnformattedBlock)(Cookie cookie, const char *x, bool IONAME(InputUnformattedBlock)( Cookie cookie, char *x, std::size_t length, std::size_t elementBytes) { IoStatementState &io{*cookie}; + io.BeginReadingRecord(); if (auto *unf{io.get_if>()}) { return unf->Receive(x, length, elementBytes); } @@ -883,6 +917,7 @@ bool IONAME(InputInteger)(Cookie cookie, std::int64_t &n, int kind) { "InputInteger64() called for a non-input I/O statement"); return false; } + io.BeginReadingRecord(); if (auto edit{io.GetNextDataEdit()}) { if (edit->descriptor == DataEdit::ListDirectedNullValue) { return true; @@ -922,6 +957,7 @@ static bool InputReal(Cookie cookie, REAL &x) { "InputReal() called for a non-input I/O statement"); return false; } + io.BeginReadingRecord(); if (auto edit{io.GetNextDataEdit()}) { if (edit->descriptor == DataEdit::ListDirectedNullValue) { return true; @@ -968,6 +1004,7 @@ static bool InputComplex(Cookie cookie, REAL x[2]) { "InputComplex() called for a non-input I/O statement"); return false; } + io.BeginReadingRecord(); for (int j{0}; j < 2; ++j) { if (auto edit{io.GetNextDataEdit()}) { if (edit->descriptor == DataEdit::ListDirectedNullValue) { @@ -1012,6 +1049,7 @@ bool IONAME(InputAscii)(Cookie cookie, char *x, std::size_t length) { "InputAscii() called for a non-input I/O statement"); return false; } + io.BeginReadingRecord(); if (auto edit{io.GetNextDataEdit()}) { if (edit->descriptor == DataEdit::ListDirectedNullValue) { return true; @@ -1044,6 +1082,7 @@ bool IONAME(InputLogical)(Cookie cookie, bool &truth) { "InputLogical() called for a non-input I/O statement"); return false; } + io.BeginReadingRecord(); if (auto edit{io.GetNextDataEdit()}) { if (edit->descriptor == DataEdit::ListDirectedNullValue) { return true; @@ -1060,6 +1099,34 @@ void IONAME(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) { } } +bool IONAME(InquireCharacter)(Cookie cookie, InquiryKeywordHash inquiry, + char *result, std::size_t length) { + IoStatementState &io{*cookie}; + return io.Inquire(inquiry, result, length); +} + +bool IONAME(InquireLogical)( + Cookie cookie, InquiryKeywordHash inquiry, bool &result) { + IoStatementState &io{*cookie}; + return io.Inquire(inquiry, result); +} + +bool IONAME(InquirePendingId)(Cookie cookie, std::int64_t id, bool &result) { + IoStatementState &io{*cookie}; + return io.Inquire(HashInquiryKeyword("PENDING"), id, result); +} + +bool IONAME(InquireInteger64)( + Cookie cookie, InquiryKeywordHash inquiry, std::int64_t &result, int kind) { + IoStatementState &io{*cookie}; + std::int64_t n; + if (io.Inquire(inquiry, n)) { + SetInteger(result, kind, n); + return true; + } + return false; +} + enum Iostat IONAME(EndIoStatement)(Cookie cookie) { IoStatementState &io{*cookie}; return static_cast(io.EndIoStatement()); diff --git a/flang/runtime/io-api.h b/flang/runtime/io-api.h index f6ebc63e3f3d2..a38152d6ec1c1 100644 --- a/flang/runtime/io-api.h +++ b/flang/runtime/io-api.h @@ -29,6 +29,26 @@ using ExternalUnit = int; using AsynchronousId = int; static constexpr ExternalUnit DefaultUnit{-1}; // READ(*), WRITE(*), PRINT +// INQUIRE specifiers are encoded as simple base-26 packings of +// the spellings of their keywords. 
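+// For example (a worked illustration, not part of the original comment):
+//   HashInquiryKeyword("READ") == ((((1*26+17)*26+4)*26+0)*26+3) == 758475
+// and InquiryKeywordHashDecode() recovers "READ" by peeling the base-26
+// digits back off in reverse.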
+using InquiryKeywordHash = std::uint64_t; +constexpr InquiryKeywordHash HashInquiryKeyword(const char *p) { + InquiryKeywordHash hash{1}; + while (char ch{*p++}) { + std::uint64_t letter{0}; + if (ch >= 'a' && ch <= 'z') { + letter = ch - 'a'; + } else { + letter = ch - 'A'; + } + hash = 26 * hash + letter; + } + return hash; +} + +const char *InquiryKeywordHashDecode( + char *buffer, std::size_t, InquiryKeywordHash); + extern "C" { #define IONAME(name) RTNAME(io##name) @@ -150,7 +170,7 @@ Cookie IONAME(BeginOpenNewUnit)( // BeginInquireIoLength() is basically a no-op output statement. Cookie IONAME(BeginInquireUnit)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IONAME(BeginInquireFile)(const char *, std::size_t, int kind = 1, +Cookie IONAME(BeginInquireFile)(const char *, std::size_t, const char *sourceFile = nullptr, int sourceLine = 0); Cookie IONAME(BeginInquireIoLength)( const char *sourceFile = nullptr, int sourceLine = 0); @@ -255,10 +275,7 @@ bool IONAME(SetRecl)(Cookie, std::size_t); // RECL= // For CLOSE: STATUS=KEEP, DELETE bool IONAME(SetStatus)(Cookie, const char *, std::size_t); -// SetFile() may pass a CHARACTER argument of non-default kind, -// and such filenames are converted to UTF-8 before being -// presented to the filesystem. -bool IONAME(SetFile)(Cookie, const char *, std::size_t chars, int kind = 1); +bool IONAME(SetFile)(Cookie, const char *, std::size_t chars); // Acquires the runtime-created unit number for OPEN(NEWUNIT=) bool IONAME(GetNewUnit)(Cookie, int &, int kind = 4); @@ -275,18 +292,17 @@ void IONAME(GetIoMsg)(Cookie, char *, std::size_t); // IOMSG= // INQUIRE() specifiers are mostly identified by their NUL-terminated // case-insensitive names. -// ACCESS, ACTION, ASYNCHRONOUS, BLANK, DECIMAL, DELIM, DIRECT, ENCODING, -// FORM, FORMATTED, NAME, PAD, POSITION, READ, READWRITE, ROUND, +// ACCESS, ACTION, ASYNCHRONOUS, BLANK, CONVERT, DECIMAL, DELIM, DIRECT, +// ENCODING, FORM, FORMATTED, NAME, PAD, POSITION, READ, READWRITE, ROUND, // SEQUENTIAL, SIGN, STREAM, UNFORMATTED, WRITE: -bool IONAME(InquireCharacter)( - Cookie, const char *specifier, char *, std::size_t); +bool IONAME(InquireCharacter)(Cookie, InquiryKeywordHash, char *, std::size_t); // EXIST, NAMED, OPENED, and PENDING (without ID): -bool IONAME(InquireLogical)(Cookie, const char *specifier, bool &); +bool IONAME(InquireLogical)(Cookie, InquiryKeywordHash, bool &); // PENDING with ID bool IONAME(InquirePendingId)(Cookie, std::int64_t, bool &); // NEXTREC, NUMBER, POS, RECL, SIZE bool IONAME(InquireInteger64)( - Cookie, const char *specifier, std::int64_t &, int kind = 8); + Cookie, InquiryKeywordHash, std::int64_t &, int kind = 8); // This function must be called to end an I/O statement, and its // cookie value may not be used afterwards unless it is recycled diff --git a/flang/runtime/io-error.h b/flang/runtime/io-error.h index 8d43c40ef103c..5dd7f5e03d080 100644 --- a/flang/runtime/io-error.h +++ b/flang/runtime/io-error.h @@ -38,7 +38,7 @@ class IoErrorHandler : public Terminator { void SignalError(int iostatOrErrno, const char *msg, ...); void SignalError(int iostatOrErrno); - template void SignalError(const char *msg, X &&... 
xs) { + template void SignalError(const char *msg, X &&...xs) { SignalError(IostatGenericError, msg, std::forward(xs)...); } diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index b8e7781f235b0..8300b1ea3c27b 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -26,6 +26,37 @@ std::optional IoStatementBase::GetNextDataEdit( return std::nullopt; } +bool IoStatementBase::Inquire(InquiryKeywordHash, char *, std::size_t) { + Crash( + "IoStatementBase::Inquire() called for I/O statement other than INQUIRE"); + return false; +} + +bool IoStatementBase::Inquire(InquiryKeywordHash, bool &) { + Crash( + "IoStatementBase::Inquire() called for I/O statement other than INQUIRE"); + return false; +} + +bool IoStatementBase::Inquire(InquiryKeywordHash, std::int64_t, bool &) { + Crash( + "IoStatementBase::Inquire() called for I/O statement other than INQUIRE"); + return false; +} + +bool IoStatementBase::Inquire(InquiryKeywordHash, std::int64_t &) { + Crash( + "IoStatementBase::Inquire() called for I/O statement other than INQUIRE"); + return false; +} + +void IoStatementBase::BadInquiryKeywordHashCrash(InquiryKeywordHash inquiry) { + char buffer[16]; + const char *decode{InquiryKeywordHashDecode(buffer, sizeof buffer, inquiry)}; + Crash("bad InquiryKeywordHash 0x%x (%s)", inquiry, + decode ? decode : "(cannot decode)"); +} + template InternalIoStatementState::InternalIoStatementState( Buffer scalar, std::size_t length, const char *sourceFile, int sourceLine) @@ -151,14 +182,9 @@ int ExternalIoStatementBase::EndIoStatement() { return result; } -void OpenStatementState::set_path( - const char *path, std::size_t length, int kind) { - if (kind != 1) { // TODO - Crash("OPEN: FILE= with unimplemented: CHARACTER(KIND=%d)", kind); - } - std::size_t bytes{length * kind}; // TODO: UTF-8 encoding of Unicode path - path_ = SaveDefaultCharacter(path, bytes, *this); - pathLength_ = length; +void OpenStatementState::set_path(const char *path, std::size_t length) { + pathLength_ = TrimTrailingSpaces(path, length); + path_ = SaveDefaultCharacter(path, pathLength_, *this); } int OpenStatementState::EndIoStatement() { @@ -166,8 +192,31 @@ int OpenStatementState::EndIoStatement() { SignalError("OPEN statement for connected unit may not have STATUS= other " "than 'OLD'"); } - unit().OpenUnit(status_.value_or(OpenStatus::Unknown), action_, position_, - std::move(path_), pathLength_, convert_, *this); + if (path_.get() || wasExtant_ || + (status_ && *status_ == OpenStatus::Scratch)) { + unit().OpenUnit(status_.value_or(OpenStatus::Unknown), action_, position_, + std::move(path_), pathLength_, convert_, *this); + } else { + unit().OpenAnonymousUnit(status_.value_or(OpenStatus::Unknown), action_, + position_, convert_, *this); + } + if (access_) { + if (*access_ != unit().access) { + if (wasExtant_) { + SignalError("ACCESS= may not be changed on an open unit"); + } + } + unit().access = *access_; + } + if (!isUnformatted_) { + isUnformatted_ = unit().access != Access::Sequential; + } + if (*isUnformatted_ != unit().isUnformatted) { + if (wasExtant_) { + SignalError("FORM= may not be changed on an open unit"); + } + unit().isUnformatted = *isUnformatted_; + } return ExternalIoStatementBase::EndIoStatement(); } @@ -178,14 +227,17 @@ int CloseStatementState::EndIoStatement() { return result; } -int NoopCloseStatementState::EndIoStatement() { +int NoUnitIoStatementState::EndIoStatement() { auto result{IoStatementBase::EndIoStatement()}; FreeMemory(this); return result; } template int 
ExternalIoStatementState::EndIoStatement() { - if (!unit().nonAdvancing) { + if constexpr (DIR == Direction::Input) { + BeginReadingRecord(); // in case of READ with no data items + } + if (!unit().nonAdvancing && GetIoStat() != IostatEnd) { unit().AdvanceRecord(*this); } if constexpr (DIR == Direction::Output) { @@ -260,6 +312,16 @@ void ExternalIoStatementState::HandleRelativePosition(std::int64_t n) { return unit().HandleRelativePosition(n); } +template +void ExternalIoStatementState::BeginReadingRecord() { + if constexpr (DIR == Direction::Input) { + if (!beganReading_) { + beganReading_ = true; + unit().BeginReadingRecord(*this); + } + } +} + template ExternalFormattedIoStatementState::ExternalFormattedIoStatementState( ExternalFileUnit &unit, const CHAR *format, std::size_t formatLength, @@ -315,6 +377,10 @@ MutableModes &IoStatementState::mutableModes() { [](auto &x) -> MutableModes & { return x.get().mutableModes(); }, u_); } +void IoStatementState::BeginReadingRecord() { + std::visit([](auto &x) { return x.get().BeginReadingRecord(); }, u_); +} + IoErrorHandler &IoStatementState::GetIoErrorHandler() const { return std::visit( [](auto &x) -> IoErrorHandler & { @@ -437,6 +503,26 @@ bool ListDirectedStatementState::NeedAdvance( width > connection.RemainingSpaceInRecord(); } +bool IoStatementState::Inquire( + InquiryKeywordHash inquiry, char *out, std::size_t chars) { + return std::visit( + [&](auto &x) { return x.get().Inquire(inquiry, out, chars); }, u_); +} + +bool IoStatementState::Inquire(InquiryKeywordHash inquiry, bool &out) { + return std::visit([&](auto &x) { return x.get().Inquire(inquiry, out); }, u_); +} + +bool IoStatementState::Inquire( + InquiryKeywordHash inquiry, std::int64_t id, bool &out) { + return std::visit( + [&](auto &x) { return x.get().Inquire(inquiry, id, out); }, u_); +} + +bool IoStatementState::Inquire(InquiryKeywordHash inquiry, std::int64_t &n) { + return std::visit([&](auto &x) { return x.get().Inquire(inquiry, n); }, u_); +} + bool ListDirectedStatementState::EmitLeadingSpaceOrAdvance( IoStatementState &io, std::size_t length, bool isCharacter) { if (length == 0) { @@ -661,4 +747,419 @@ int ExternalMiscIoStatementState::EndIoStatement() { return ExternalIoStatementBase::EndIoStatement(); } +InquireUnitState::InquireUnitState( + ExternalFileUnit &unit, const char *sourceFile, int sourceLine) + : ExternalIoStatementBase{unit, sourceFile, sourceLine} {} + +bool InquireUnitState::Inquire( + InquiryKeywordHash inquiry, char *result, std::size_t length) { + const char *str{nullptr}; + switch (inquiry) { + case HashInquiryKeyword("ACCESS"): + switch (unit().access) { + case Access::Sequential: + str = "SEQUENTIAL"; + break; + case Access::Direct: + str = "DIRECT"; + break; + case Access::Stream: + str = "STREAM"; + break; + } + break; + case HashInquiryKeyword("ACTION"): + str = unit().mayWrite() ? unit().mayRead() ? "READWRITE" : "WRITE" : "READ"; + break; + case HashInquiryKeyword("ASYNCHRONOUS"): + str = unit().mayAsynchronous() ? "YES" : "NO"; + break; + case HashInquiryKeyword("BLANK"): + str = unit().isUnformatted ? "UNDEFINED" + : unit().modes.editingFlags & blankZero ? "ZERO" + : "NULL"; + break; + case HashInquiryKeyword("CONVERT"): + str = unit().swapEndianness() ? "SWAP" : "NATIVE"; + break; + case HashInquiryKeyword("DECIMAL"): + str = unit().isUnformatted ? "UNDEFINED" + : unit().modes.editingFlags & decimalComma ? 
"COMMA" + : "POINT"; + break; + case HashInquiryKeyword("DELIM"): + if (unit().isUnformatted) { + str = "UNDEFINED"; + } else { + switch (unit().modes.delim) { + case '\'': + str = "APOSTROPHE"; + break; + case '"': + str = "QUOTE"; + break; + default: + str = "NONE"; + break; + } + } + break; + case HashInquiryKeyword("DIRECT"): + str = unit().mayPosition() ? "YES" : "NO"; + break; + case HashInquiryKeyword("ENCODING"): + str = unit().isUnformatted ? "UNDEFINED" + : unit().isUTF8 ? "UTF-8" + : "ASCII"; + break; + case HashInquiryKeyword("FORM"): + str = unit().isUnformatted ? "UNFORMATTED" : "FORMATTED"; + break; + case HashInquiryKeyword("FORMATTED"): + str = "YES"; + break; + case HashInquiryKeyword("NAME"): + str = unit().path(); + if (!str) { + return true; // result is undefined + } + break; + case HashInquiryKeyword("PAD"): + str = unit().isUnformatted ? "UNDEFINED" : unit().modes.pad ? "YES" : "NO"; + break; + case HashInquiryKeyword("POSITION"): + if (unit().access == Access::Direct) { + str = "UNDEFINED"; + } else { + auto size{unit().knownSize()}; + auto pos{unit().position()}; + if (pos == size.value_or(pos + 1)) { + str = "APPEND"; + } else if (pos == 0) { + str = "REWIND"; + } else { + str = "ASIS"; // processor-dependent & no common behavior + } + } + break; + case HashInquiryKeyword("READ"): + str = unit().mayRead() ? "YES" : "NO"; + break; + case HashInquiryKeyword("READWRITE"): + str = unit().mayRead() && unit().mayWrite() ? "YES" : "NO"; + break; + case HashInquiryKeyword("ROUND"): + if (unit().isUnformatted) { + str = "UNDEFINED"; + } else { + switch (unit().modes.round) { + case decimal::FortranRounding::RoundNearest: + str = "NEAREST"; + break; + case decimal::FortranRounding::RoundUp: + str = "UP"; + break; + case decimal::FortranRounding::RoundDown: + str = "DOWN"; + break; + case decimal::FortranRounding::RoundToZero: + str = "ZERO"; + break; + case decimal::FortranRounding::RoundCompatible: + str = "COMPATIBLE"; + break; + } + } + break; + case HashInquiryKeyword("SEQUENTIAL"): + str = "YES"; + break; + case HashInquiryKeyword("SIGN"): + str = unit().isUnformatted ? "UNDEFINED" + : unit().modes.editingFlags & signPlus ? "PLUS" + : "SUPPRESS"; + break; + case HashInquiryKeyword("STREAM"): + str = "YES"; + break; + case HashInquiryKeyword("WRITE"): + str = unit().mayWrite() ? 
"YES" : "NO"; + break; + case HashInquiryKeyword("UNFORMATTED"): + str = "YES"; + break; + } + if (str) { + ToFortranDefaultCharacter(result, length, str); + return true; + } else { + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnitState::Inquire(InquiryKeywordHash inquiry, bool &result) { + switch (inquiry) { + case HashInquiryKeyword("EXIST"): + result = true; + return true; + case HashInquiryKeyword("NAMED"): + result = unit().path() != nullptr; + return true; + case HashInquiryKeyword("OPENED"): + result = true; + return true; + case HashInquiryKeyword("PENDING"): + result = false; // asynchronous I/O is not implemented + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnitState::Inquire( + InquiryKeywordHash inquiry, std::int64_t, bool &result) { + switch (inquiry) { + case HashInquiryKeyword("PENDING"): + result = false; // asynchronous I/O is not implemented + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnitState::Inquire( + InquiryKeywordHash inquiry, std::int64_t &result) { + switch (inquiry) { + case HashInquiryKeyword("NEXTREC"): + if (unit().access == Access::Direct) { + result = unit().currentRecordNumber; + } + return true; + case HashInquiryKeyword("NUMBER"): + result = unit().unitNumber(); + return true; + case HashInquiryKeyword("POS"): + result = unit().position(); + return true; + case HashInquiryKeyword("RECL"): + if (unit().access == Access::Stream) { + result = -2; + } else if (unit().isFixedRecordLength && unit().recordLength) { + result = *unit().recordLength; + } else { + result = std::numeric_limits::max(); + } + return true; + case HashInquiryKeyword("SIZE"): + if (auto size{unit().knownSize()}) { + result = *size; + } else { + result = -1; + } + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +InquireNoUnitState::InquireNoUnitState(const char *sourceFile, int sourceLine) + : NoUnitIoStatementState{sourceFile, sourceLine, *this} {} + +bool InquireNoUnitState::Inquire( + InquiryKeywordHash inquiry, char *result, std::size_t length) { + switch (inquiry) { + case HashInquiryKeyword("ACCESS"): + case HashInquiryKeyword("ACTION"): + case HashInquiryKeyword("ASYNCHRONOUS"): + case HashInquiryKeyword("BLANK"): + case HashInquiryKeyword("CONVERT"): + case HashInquiryKeyword("DECIMAL"): + case HashInquiryKeyword("DELIM"): + case HashInquiryKeyword("FORM"): + case HashInquiryKeyword("NAME"): + case HashInquiryKeyword("PAD"): + case HashInquiryKeyword("POSITION"): + case HashInquiryKeyword("ROUND"): + case HashInquiryKeyword("SIGN"): + ToFortranDefaultCharacter(result, length, "UNDEFINED"); + return true; + case HashInquiryKeyword("DIRECT"): + case HashInquiryKeyword("ENCODING"): + case HashInquiryKeyword("FORMATTED"): + case HashInquiryKeyword("READ"): + case HashInquiryKeyword("READWRITE"): + case HashInquiryKeyword("SEQUENTIAL"): + case HashInquiryKeyword("STREAM"): + case HashInquiryKeyword("WRITE"): + case HashInquiryKeyword("UNFORMATTED"): + ToFortranDefaultCharacter(result, length, "UNKNONN"); + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireNoUnitState::Inquire(InquiryKeywordHash inquiry, bool &result) { + switch (inquiry) { + case HashInquiryKeyword("EXIST"): + result = true; + return true; + case HashInquiryKeyword("NAMED"): + case HashInquiryKeyword("OPENED"): + case HashInquiryKeyword("PENDING"): + result = false; + return true; + 
+  default:
+    BadInquiryKeywordHashCrash(inquiry);
+    return false;
+  }
+}
+
+bool InquireNoUnitState::Inquire(
+    InquiryKeywordHash inquiry, std::int64_t, bool &result) {
+  switch (inquiry) {
+  case HashInquiryKeyword("PENDING"):
+    result = false;
+    return true;
+  default:
+    BadInquiryKeywordHashCrash(inquiry);
+    return false;
+  }
+}
+
+bool InquireNoUnitState::Inquire(
+    InquiryKeywordHash inquiry, std::int64_t &result) {
+  switch (inquiry) {
+  case HashInquiryKeyword("NEXTREC"):
+  case HashInquiryKeyword("NUMBER"):
+  case HashInquiryKeyword("POS"):
+  case HashInquiryKeyword("RECL"):
+  case HashInquiryKeyword("SIZE"):
+    result = -1;
+    return true;
+  default:
+    BadInquiryKeywordHashCrash(inquiry);
+    return false;
+  }
+}
+
+InquireUnconnectedFileState::InquireUnconnectedFileState(
+    OwningPtr<char> &&path, const char *sourceFile, int sourceLine)
+    : NoUnitIoStatementState{sourceFile, sourceLine, *this}, path_{std::move(
+                                                                 path)} {}
+
+bool InquireUnconnectedFileState::Inquire(
+    InquiryKeywordHash inquiry, char *result, std::size_t length) {
+  const char *str{nullptr};
+  switch (inquiry) {
+  case HashInquiryKeyword("ACCESS"):
+  case HashInquiryKeyword("ACTION"):
+  case HashInquiryKeyword("ASYNCHRONOUS"):
+  case HashInquiryKeyword("BLANK"):
+  case HashInquiryKeyword("CONVERT"):
+  case HashInquiryKeyword("DECIMAL"):
+  case HashInquiryKeyword("DELIM"):
+  case HashInquiryKeyword("FORM"):
+  case HashInquiryKeyword("PAD"):
+  case HashInquiryKeyword("POSITION"):
+  case HashInquiryKeyword("ROUND"):
+  case HashInquiryKeyword("SIGN"):
+    str = "UNDEFINED";
+    break;
+  case HashInquiryKeyword("DIRECT"):
+  case HashInquiryKeyword("ENCODING"):
+    str = "UNKNOWN";
+    break;
+  case HashInquiryKeyword("READ"):
+    str = MayRead(path_.get()) ? "YES" : "NO";
+    break;
+  case HashInquiryKeyword("READWRITE"):
+    str = MayReadAndWrite(path_.get()) ? "YES" : "NO";
+    break;
+  case HashInquiryKeyword("WRITE"):
+    str = MayWrite(path_.get()) ?
"YES" : "NO"; + break; + case HashInquiryKeyword("FORMATTED"): + case HashInquiryKeyword("SEQUENTIAL"): + case HashInquiryKeyword("STREAM"): + case HashInquiryKeyword("UNFORMATTED"): + str = "YES"; + break; + case HashInquiryKeyword("NAME"): + str = path_.get(); + return true; + } + if (str) { + ToFortranDefaultCharacter(result, length, str); + return true; + } else { + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnconnectedFileState::Inquire( + InquiryKeywordHash inquiry, bool &result) { + switch (inquiry) { + case HashInquiryKeyword("EXIST"): + result = IsExtant(path_.get()); + return true; + case HashInquiryKeyword("NAMED"): + result = true; + return true; + case HashInquiryKeyword("OPENED"): + result = false; + return true; + case HashInquiryKeyword("PENDING"): + result = false; + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnconnectedFileState::Inquire( + InquiryKeywordHash inquiry, std::int64_t, bool &result) { + switch (inquiry) { + case HashInquiryKeyword("PENDING"): + result = false; + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +bool InquireUnconnectedFileState::Inquire( + InquiryKeywordHash inquiry, std::int64_t &result) { + switch (inquiry) { + case HashInquiryKeyword("NEXTREC"): + case HashInquiryKeyword("NUMBER"): + case HashInquiryKeyword("POS"): + case HashInquiryKeyword("RECL"): + case HashInquiryKeyword("SIZE"): + result = -1; + return true; + default: + BadInquiryKeywordHashCrash(inquiry); + return false; + } +} + +InquireIOLengthState::InquireIOLengthState( + const char *sourceFile, int sourceLine) + : NoUnitIoStatementState{sourceFile, sourceLine, *this} {} + +bool InquireIOLengthState::Emit( + const char *, std::size_t n, std::size_t /*elementBytes*/) { + bytes_ += n; + return true; +} + } // namespace Fortran::runtime::io diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index 755e5946ff3b9..9e68deab2e641 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -16,6 +16,7 @@ #include "file.h" #include "format.h" #include "internal-unit.h" +#include "io-api.h" #include "io-error.h" #include #include @@ -26,6 +27,11 @@ namespace Fortran::runtime::io { class ExternalFileUnit; class OpenStatementState; +class InquireUnitState; +class InquireNoUnitState; +class InquireUnconnectedFileState; +class InquireIOLengthState; +class ExternalMiscIoStatementState; class CloseStatementState; class NoopCloseStatementState; @@ -36,7 +42,6 @@ template class ExternalFormattedIoStatementState; template class ExternalListIoStatementState; template class UnformattedIoStatementState; -class ExternalMiscIoStatementState; // The Cookie type in the I/O API is a pointer (for C) to this class. 
class IoStatementState { @@ -59,6 +64,11 @@ class IoStatementState { IoErrorHandler &GetIoErrorHandler() const; ExternalFileUnit *GetExternalFileUnit() const; // null if internal unit MutableModes &mutableModes(); + void BeginReadingRecord(); + bool Inquire(InquiryKeywordHash, char *, std::size_t); + bool Inquire(InquiryKeywordHash, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t, bool &); // PENDING= + bool Inquire(InquiryKeywordHash, std::int64_t &); // N.B.: this also works with base classes template A *get_if() const { @@ -97,6 +107,10 @@ class IoStatementState { std::reference_wrapper>, std::reference_wrapper>, std::reference_wrapper>, + std::reference_wrapper, + std::reference_wrapper, + std::reference_wrapper, + std::reference_wrapper, std::reference_wrapper> u_; }; @@ -108,6 +122,13 @@ struct IoStatementBase : public DefaultFormatControlCallbacks { int EndIoStatement(); std::optional GetNextDataEdit(IoStatementState &, int = 1); ExternalFileUnit *GetExternalFileUnit() const { return nullptr; } + void BeginReadingRecord() {} + + bool Inquire(InquiryKeywordHash, char *, std::size_t); + bool Inquire(InquiryKeywordHash, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t &); + void BadInquiryKeywordHashCrash(InquiryKeywordHash); }; struct InputStatementState {}; @@ -247,6 +268,10 @@ class ExternalIoStatementState : public ExternalIoStatementBase, void BackspaceRecord(); void HandleRelativePosition(std::int64_t); void HandleAbsolutePosition(std::int64_t); + void BeginReadingRecord(); + +private: + bool beganReading_{false}; }; template @@ -297,10 +322,12 @@ class OpenStatementState : public ExternalIoStatementBase { wasExtant} {} bool wasExtant() const { return wasExtant_; } void set_status(OpenStatus status) { status_ = status; } // STATUS= - void set_path(const char *, std::size_t, int kind); // FILE= + void set_path(const char *, std::size_t); // FILE= void set_position(Position position) { position_ = position; } // POSITION= void set_action(Action action) { action_ = action; } // ACTION= void set_convert(Convert convert) { convert_ = convert; } // CONVERT= + void set_access(Access access) { access_ = access; } // ACCESS= + void set_isUnformatted(bool yes = true) { isUnformatted_ = yes; } // FORM= int EndIoStatement(); private: @@ -311,6 +338,8 @@ class OpenStatementState : public ExternalIoStatementBase { Convert convert_{Convert::Native}; OwningPtr path_; std::size_t pathLength_; + std::optional isUnformatted_; + std::optional access_; }; class CloseStatementState : public ExternalIoStatementBase { @@ -325,21 +354,31 @@ class CloseStatementState : public ExternalIoStatementBase { CloseStatus status_{CloseStatus::Keep}; }; -class NoopCloseStatementState : public IoStatementBase { +// For CLOSE(bad unit) and INQUIRE(unconnected unit) +class NoUnitIoStatementState : public IoStatementBase { public: - NoopCloseStatementState(const char *sourceFile, int sourceLine) - : IoStatementBase{sourceFile, sourceLine}, ioStatementState_{*this} {} IoStatementState &ioStatementState() { return ioStatementState_; } - void set_status(CloseStatus) {} // discards MutableModes &mutableModes() { return connection_.modes; } ConnectionState &GetConnectionState() { return connection_; } int EndIoStatement(); +protected: + template + NoUnitIoStatementState(const char *sourceFile, int sourceLine, A &stmt) + : IoStatementBase{sourceFile, sourceLine}, ioStatementState_{stmt} {} + private: IoStatementState ioStatementState_; // points to 
*this ConnectionState connection_; }; +class NoopCloseStatementState : public NoUnitIoStatementState { +public: + NoopCloseStatementState(const char *sourceFile, int sourceLine) + : NoUnitIoStatementState{sourceFile, sourceLine, *this} {} + void set_status(CloseStatus) {} // discards +}; + extern template class InternalIoStatementState; extern template class InternalIoStatementState; extern template class InternalFormattedIoStatementState; @@ -363,6 +402,49 @@ extern template class FormatControl< extern template class FormatControl< ExternalFormattedIoStatementState>; +class InquireUnitState : public ExternalIoStatementBase { +public: + InquireUnitState(ExternalFileUnit &unit, const char *sourceFile = nullptr, + int sourceLine = 0); + bool Inquire(InquiryKeywordHash, char *, std::size_t); + bool Inquire(InquiryKeywordHash, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t &); +}; + +class InquireNoUnitState : public NoUnitIoStatementState { +public: + InquireNoUnitState(const char *sourceFile = nullptr, int sourceLine = 0); + bool Inquire(InquiryKeywordHash, char *, std::size_t); + bool Inquire(InquiryKeywordHash, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t &); +}; + +class InquireUnconnectedFileState : public NoUnitIoStatementState { +public: + InquireUnconnectedFileState(OwningPtr &&path, + const char *sourceFile = nullptr, int sourceLine = 0); + bool Inquire(InquiryKeywordHash, char *, std::size_t); + bool Inquire(InquiryKeywordHash, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t, bool &); + bool Inquire(InquiryKeywordHash, std::int64_t &); + +private: + OwningPtr path_; // trimmed and NUL terminated +}; + +class InquireIOLengthState : public NoUnitIoStatementState, + public OutputStatementState { +public: + InquireIOLengthState(const char *sourceFile = nullptr, int sourceLine = 0); + std::size_t bytes() const { return bytes_; } + bool Emit(const char *, std::size_t, std::size_t elementBytes = 0); + +private: + std::size_t bytes_{0}; +}; + class ExternalMiscIoStatementState : public ExternalIoStatementBase { public: enum Which { Flush, Backspace, Endfile, Rewind }; diff --git a/flang/runtime/memory.h b/flang/runtime/memory.h index f21b237f3905b..4b09fe80772ed 100644 --- a/flang/runtime/memory.h +++ b/flang/runtime/memory.h @@ -42,7 +42,7 @@ template class SizedNew { public: explicit SizedNew(const Terminator &terminator) : terminator_{terminator} {} template - [[nodiscard]] OwningPtr operator()(std::size_t bytes, X &&... x) { + [[nodiscard]] OwningPtr operator()(std::size_t bytes, X &&...x) { return OwningPtr{new (AllocateMemoryOrCrash(terminator_, bytes)) A{std::forward(x)...}}; } @@ -53,7 +53,7 @@ template class SizedNew { template struct New : public SizedNew { using SizedNew::SizedNew; - template [[nodiscard]] OwningPtr operator()(X &&... 
x) { + template [[nodiscard]] OwningPtr operator()(X &&...x) { return SizedNew::operator()(sizeof(A), std::forward(x)...); } }; diff --git a/flang/runtime/tools.cpp b/flang/runtime/tools.cpp index ea9ad9063344b..219daaf2880b1 100644 --- a/flang/runtime/tools.cpp +++ b/flang/runtime/tools.cpp @@ -12,6 +12,13 @@ namespace Fortran::runtime { +std::size_t TrimTrailingSpaces(const char *s, std::size_t n) { + while (n > 0 && s[n - 1] == ' ') { + --n; + } + return n; +} + OwningPtr SaveDefaultCharacter( const char *s, std::size_t length, const Terminator &terminator) { if (s) { diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index fad19f607c683..6c5eb63cc8c11 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -18,6 +18,8 @@ namespace Fortran::runtime { class Terminator; +std::size_t TrimTrailingSpaces(const char *, std::size_t); + OwningPtr SaveDefaultCharacter( const char *, std::size_t, const Terminator &); diff --git a/flang/runtime/unit-map.cpp b/flang/runtime/unit-map.cpp index 905beb4d084fa..1cd2115f4aa1b 100644 --- a/flang/runtime/unit-map.cpp +++ b/flang/runtime/unit-map.cpp @@ -72,6 +72,20 @@ void UnitMap::FlushAll(IoErrorHandler &handler) { } } +ExternalFileUnit *UnitMap::Find(const char *path) { + if (path) { + // TODO: Faster data structure + for (int j{0}; j < buckets_; ++j) { + for (Chain *p{bucket_[j].get()}; p; p = p->next.get()) { + if (p->unit.path() && std::strcmp(p->unit.path(), path) == 0) { + return &p->unit; + } + } + } + } + return nullptr; +} + ExternalFileUnit &UnitMap::Create(int n, const Terminator &terminator) { Chain &chain{*New{terminator}(n).release()}; chain.next.reset(&chain); diff --git a/flang/runtime/unit-map.h b/flang/runtime/unit-map.h index be244f5ae463a..961962a2d635c 100644 --- a/flang/runtime/unit-map.h +++ b/flang/runtime/unit-map.h @@ -34,6 +34,12 @@ class UnitMap { return p ? *p : Create(n, terminator); } + // Unit look-up by name is needed for INQUIRE(FILE="...") + ExternalFileUnit *LookUp(const char *path) { + CriticalSection critical{lock_}; + return Find(path); + } + ExternalFileUnit &NewUnit(const Terminator &terminator) { CriticalSection critical{lock_}; return Create(nextNewUnit_--, terminator); @@ -72,6 +78,7 @@ class UnitMap { } return nullptr; } + ExternalFileUnit *Find(const char *path); ExternalFileUnit &Create(int, const Terminator &); diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp index a4c69df8d6a9a..be36666f66e46 100644 --- a/flang/runtime/unit.cpp +++ b/flang/runtime/unit.cpp @@ -59,20 +59,19 @@ ExternalFileUnit &ExternalFileUnit::LookUpOrCreateAnonymous( ExternalFileUnit &result{ GetUnitMap().LookUpOrCreate(unit, terminator, exists)}; if (!exists) { - // I/O to an unconnected unit reads/creates a local file, e.g. fort.7 - std::size_t pathMaxLen{32}; - auto path{SizedNew{terminator}(pathMaxLen)}; - std::snprintf(path.get(), pathMaxLen, "fort.%d", unit); IoErrorHandler handler{terminator}; - result.OpenUnit( - dir == Direction::Input ? OpenStatus::Old : OpenStatus::Replace, - Action::ReadWrite, Position::Rewind, std::move(path), - std::strlen(path.get()), Convert::Native, handler); + result.OpenAnonymousUnit( + dir == Direction::Input ? 
OpenStatus::Unknown : OpenStatus::Replace, + Action::ReadWrite, Position::Rewind, Convert::Native, handler); result.isUnformatted = isUnformatted; } return result; } +ExternalFileUnit *ExternalFileUnit::LookUp(const char *path) { + return GetUnitMap().LookUp(path); +} + ExternalFileUnit &ExternalFileUnit::CreateNew( int unit, const Terminator &terminator) { bool wasExtant{false}; @@ -125,10 +124,7 @@ void ExternalFileUnit::OpenUnit(OpenStatus status, std::optional action, handler.SignalError(IostatOpenBadRecl, "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is invalid", unitNumber(), static_cast(*recordLength)); - } else if (!totalBytes) { - handler.SignalError(IostatOpenUnknownSize, - "OPEN(UNIT=%d,ACCESS='DIRECT'): file size is not known"); - } else if (*totalBytes % *recordLength != 0) { + } else if (totalBytes && (*totalBytes % *recordLength != 0)) { handler.SignalError(IostatOpenBadAppend, "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is not an " "even divisor of the file size %jd", @@ -137,7 +133,7 @@ void ExternalFileUnit::OpenUnit(OpenStatus status, std::optional action, } } if (position == Position::Append) { - if (*totalBytes && recordLength && *recordLength) { + if (totalBytes && recordLength && *recordLength) { endfileRecordNumber = 1 + (*totalBytes / *recordLength); } else { // Fake it so that we can backspace relative from the end @@ -149,6 +145,17 @@ void ExternalFileUnit::OpenUnit(OpenStatus status, std::optional action, } } +void ExternalFileUnit::OpenAnonymousUnit(OpenStatus status, + std::optional action, Position position, Convert convert, + IoErrorHandler &handler) { + // I/O to an unconnected unit reads/creates a local file, e.g. fort.7 + std::size_t pathMaxLen{32}; + auto path{SizedNew{handler}(pathMaxLen)}; + std::snprintf(path.get(), pathMaxLen, "fort.%d", unitNumber_); + OpenUnit(status, action, position, std::move(path), std::strlen(path.get()), + convert, handler); +} + void ExternalFileUnit::CloseUnit(CloseStatus status, IoErrorHandler &handler) { DoImpliedEndfile(handler); Flush(handler); diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h index f94e4229cd4c8..9d66d962bc56d 100644 --- a/flang/runtime/unit.h +++ b/flang/runtime/unit.h @@ -35,6 +35,7 @@ class ExternalFileUnit : public ConnectionState, public: explicit ExternalFileUnit(int unitNumber) : unitNumber_{unitNumber} {} int unitNumber() const { return unitNumber_; } + bool swapEndianness() const { return swapEndianness_; } static ExternalFileUnit *LookUp(int unit); static ExternalFileUnit &LookUpOrCrash(int unit, const Terminator &); @@ -42,6 +43,7 @@ class ExternalFileUnit : public ConnectionState, int unit, const Terminator &, bool &wasExtant); static ExternalFileUnit &LookUpOrCreateAnonymous( int unit, Direction, bool isUnformatted, const Terminator &); + static ExternalFileUnit *LookUp(const char *path); static ExternalFileUnit &CreateNew(int unit, const Terminator &); static ExternalFileUnit *LookUpForClose(int unit); static int NewUnit(const Terminator &); @@ -51,13 +53,15 @@ class ExternalFileUnit : public ConnectionState, void OpenUnit(OpenStatus, std::optional, Position, OwningPtr &&path, std::size_t pathLength, Convert, IoErrorHandler &); + void OpenAnonymousUnit( + OpenStatus, std::optional, Position, Convert, IoErrorHandler &); void CloseUnit(CloseStatus, IoErrorHandler &); void DestroyClosed(); bool SetDirection(Direction, IoErrorHandler &); template - IoStatementState &BeginIoStatement(X &&... 
xs) { + IoStatementState &BeginIoStatement(X &&...xs) { // TODO: Child data transfer statements vs. locking lock_.Take(); // dropped in EndIoStatement() A &state{u_.emplace(std::forward(xs)...)}; @@ -111,7 +115,7 @@ class ExternalFileUnit : public ConnectionState, ExternalListIoStatementState, ExternalListIoStatementState, UnformattedIoStatementState, - UnformattedIoStatementState, + UnformattedIoStatementState, InquireUnitState, ExternalMiscIoStatementState> u_; diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index ad47fff8c4250..7da1d94d84c4e 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -19,6 +19,13 @@ configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py ) +configure_lit_site_cfg( + ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.site.cfg.py.in + ${CMAKE_CURRENT_BINARY_DIR}/NonGtestUnit/lit.site.cfg.py + MAIN_CONFIG + ${CMAKE_CURRENT_SOURCE_DIR}/NonGtestUnit/lit.cfg.py +) + set(FLANG_TEST_PARAMS flang_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py) diff --git a/flang/test/Driver/Inputs/hello.f90 b/flang/test/Driver/Inputs/hello.f90 new file mode 100644 index 0000000000000..d0c7eb94f53c8 --- /dev/null +++ b/flang/test/Driver/Inputs/hello.f90 @@ -0,0 +1,3 @@ +program hello + write (*,*), "hello world" +end program hello diff --git a/flang/test/Driver/help.f90 b/flang/test/Driver/help.f90 new file mode 100644 index 0000000000000..d6162954a8723 --- /dev/null +++ b/flang/test/Driver/help.f90 @@ -0,0 +1,10 @@ +! RUN: %f18 -h 2>&1 | FileCheck %s +! RUN: %f18 -help 2>&1 | FileCheck %s +! RUN: %f18 --help 2>&1 | FileCheck %s +! RUN: %f18 -? 2>&1 | FileCheck %s + +! CHECK: f18: LLVM Fortran compiler + +! CHECK: -help print this again +! CHECK: Unrecognised options are passed through to the external compiler +! CHECK: set by F18_FC (see defaults). diff --git a/flang/test/Driver/no_files.f90 b/flang/test/Driver/no_files.f90 new file mode 100644 index 0000000000000..718985dce4ca5 --- /dev/null +++ b/flang/test/Driver/no_files.f90 @@ -0,0 +1,10 @@ +! RUN: %f18 < %S/Inputs/hello.f90 | FileCheck %s + + +! CHECK: Enter Fortran source +! CHECK: Use EOF character (^D) to end file + +! CHECK: Parse tree comprises {{.*}} objects and occupies {{.*}} total bytes +! CHECK: PROGRAM hello +! CHECK: WRITE (*, *) "hello world" +! CHECK: END PROGRAM hello diff --git a/flang/test/Evaluate/folding01.f90 b/flang/test/Evaluate/folding01.f90 index 09004649e9d50..465b22752cecb 100644 --- a/flang/test/Evaluate/folding01.f90 +++ b/flang/test/Evaluate/folding01.f90 @@ -63,6 +63,14 @@ module m logical, parameter :: test_ne_i1 =.NOT.(2.NE.2) logical, parameter :: test_ne_i2 = -2.NE.2 +! Check conversions + logical, parameter :: test_cmplx1 = cmplx((1._4, -1._4)).EQ.((1._4, -1._4)) + logical, parameter :: test_cmplx2 = cmplx((1._4, -1._4), 8).EQ.((1._8, -1._8)) + logical, parameter :: test_cmplx3 = cmplx(1._4, -1._4).EQ.((1._4, -1._4)) + logical, parameter :: test_cmplx4 = cmplx(1._4, -1._4, 8).EQ.((1._8, -1._8)) + logical, parameter :: test_cmplx5 = cmplx(1._4).EQ.((1._4, 0._4)) + logical, parameter :: test_cmplx6 = cmplx(1._4, kind=8).EQ.((1._8, 0._8)) + ! 
Check integer intrinsic operation folding logical, parameter :: test_unaryminus_i = (-(-1)).EQ.1 logical, parameter :: test_unaryplus_i = (+1).EQ.1 diff --git a/flang/test/NonGtestUnit/lit.cfg.py b/flang/test/NonGtestUnit/lit.cfg.py new file mode 100644 index 0000000000000..7f53f861bc65c --- /dev/null +++ b/flang/test/NonGtestUnit/lit.cfg.py @@ -0,0 +1,16 @@ +import os + +import lit.Test + +config.name = 'flang-OldUnit' + +config.suffixes = [".test"] + +config.test_source_root = os.path.join(config.flang_obj_root, 'unittests') +config.test_exec_root = config.test_source_root + +config.test_format = lit.formats.ExecutableTest() + +path = os.path.pathsep.join((config.flang_libs_dir, config.llvm_libs_dir, + config.environment.get('LD_LIBRARY_PATH',''))) +config.environment['LD_LIBRARY_PATH'] = path diff --git a/flang/test/NonGtestUnit/lit.site.cfg.py.in b/flang/test/NonGtestUnit/lit.site.cfg.py.in new file mode 100644 index 0000000000000..3218fe0b5ce3d --- /dev/null +++ b/flang/test/NonGtestUnit/lit.site.cfg.py.in @@ -0,0 +1,27 @@ +@LIT_SITE_CFG_IN_HEADER@ + +config.llvm_src_root = "@LLVM_SOURCE_DIR@" +config.llvm_obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" +config.llvm_libs_dir = "@LLVM_LIBS_DIR@" +config.llvm_build_mode = "@LLVM_BUILD_MODE@" +config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" +config.flang_obj_root = "@FLANG_BINARY_DIR@" +config.flang_src_root = "@FLANG_SOURCE_DIR@" +config.flang_libs_dir = "@LLVM_LIBRARY_OUTPUT_INTDIR@" +config.flang_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.target_triple = "@TARGET_TRIPLE@" +config.python_executable = "@Python3_EXECUTABLE@" + +# Support substitution of the tools and libs dirs with user parameters. This is +# used when we can't determine the tool dir at configuration time. +try: + config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params + config.llvm_libs_dir = config.llvm_libs_dir % lit_config.params + config.llvm_build_mode = config.llvm_build_mode % lit_config.params +except KeyError as e: + key, = e.args + lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) + +# Let the main config do the real work. +lit_config.load_config(config, "@FLANG_SOURCE_DIR@/test/NonGtestUnit/lit.cfg.py") diff --git a/flang/test/Preprocessing/lit.local.cfg.py b/flang/test/Preprocessing/lit.local.cfg.py deleted file mode 100644 index a7cf401d8c66c..0000000000000 --- a/flang/test/Preprocessing/lit.local.cfg.py +++ /dev/null @@ -1,7 +0,0 @@ -# -*- Python -*- - -from lit.llvm import llvm_config - -# Added this line file to prevent lit from discovering these tests -# See Issue #1052 -config.suffixes = [] diff --git a/flang/test/Preprocessing/pp001.F b/flang/test/Preprocessing/pp001.F index ba131b4a24c6f..95198641655a5 100644 --- a/flang/test/Preprocessing/pp001.F +++ b/flang/test/Preprocessing/pp001.F @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then * keyword macros integer, parameter :: KWM = 666 #define KWM 777 if (KWM .eq. 777) then - print *, 'pp001.F pass' + print *, 'pp001.F yes' else - print *, 'pp001.F FAIL: ', KWM + print *, 'pp001.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp002.F b/flang/test/Preprocessing/pp002.F index f46baf73e2894..b4cbd7545adf8 100644 --- a/flang/test/Preprocessing/pp002.F +++ b/flang/test/Preprocessing/pp002.F @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm.eq.777)then * #undef integer, parameter :: KWM = 777 #define KWM 666 #undef KWM if (KWM .eq. 
777) then - print *, 'pp002.F pass' + print *, 'pp002.F yes' else - print *, 'pp002.F FAIL: ', KWM + print *, 'pp002.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp003.F b/flang/test/Preprocessing/pp003.F index 0470f1909a696..4df10c47d2359 100644 --- a/flang/test/Preprocessing/pp003.F +++ b/flang/test/Preprocessing/pp003.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * function-like macros integer function IFLM(x) integer :: x @@ -8,8 +10,8 @@ program main integer :: res res = IFLM(666) if (res .eq. 777) then - print *, 'pp003.F pass' + print *, 'pp003.F yes' else - print *, 'pp003.F FAIL: ', res + print *, 'pp003.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp004.F b/flang/test/Preprocessing/pp004.F index 800a96fbedbaf..788d3682ab98a 100644 --- a/flang/test/Preprocessing/pp004.F +++ b/flang/test/Preprocessing/pp004.F @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm.eq.777)then * KWMs case-sensitive integer, parameter :: KWM = 777 #define KWM 666 if (kwm .eq. 777) then - print *, 'pp004.F pass' + print *, 'pp004.F yes' else - print *, 'pp004.F FAIL: ', kwm + print *, 'pp004.F no: ', kwm end if end diff --git a/flang/test/Preprocessing/pp005.F b/flang/test/Preprocessing/pp005.F index 05fab7a92f1b6..390e662e35574 100644 --- a/flang/test/Preprocessing/pp005.F +++ b/flang/test/Preprocessing/pp005.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=777 * KWM split across continuation, implicit padding integer, parameter :: KWM = 666 #define KWM 777 @@ -5,8 +7,8 @@ res = KW +M if (res .eq. 777) then - print *, 'pp005.F pass' + print *, 'pp005.F yes' else - print *, 'pp005.F FAIL: ', res + print *, 'pp005.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp006.F b/flang/test/Preprocessing/pp006.F index 55b87df8d4304..3c44728f56b50 100644 --- a/flang/test/Preprocessing/pp006.F +++ b/flang/test/Preprocessing/pp006.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=777 * ditto, but with intervening *comment line integer, parameter :: KWM = 666 #define KWM 777 @@ -6,8 +8,8 @@ *comment +M if (res .eq. 777) then - print *, 'pp006.F pass' + print *, 'pp006.F yes' else - print *, 'pp006.F FAIL: ', res + print *, 'pp006.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp007.F b/flang/test/Preprocessing/pp007.F index 8be4396a24928..ca78f4f4c6c5c 100644 --- a/flang/test/Preprocessing/pp007.F +++ b/flang/test/Preprocessing/pp007.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=kwm * KWM split across continuation, clipped after column 72 integer, parameter :: KWM = 666 #define KWM 777 @@ -8,8 +10,8 @@ res = KW comment +M if (res .eq. 777) then - print *, 'pp007.F pass' + print *, 'pp007.F yes' else - print *, 'pp007.F FAIL: ', res + print *, 'pp007.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp008.F b/flang/test/Preprocessing/pp008.F index 38c5b6657a871..9dd7bd92ea50f 100644 --- a/flang/test/Preprocessing/pp008.F +++ b/flang/test/Preprocessing/pp008.F @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=kwm * KWM with spaces in name at invocation NOT replaced integer, parameter :: KWM = 777 #define KWM 666 integer :: res res = K W M if (res .eq. 
777) then - print *, 'pp008.F pass' + print *, 'pp008.F yes' else - print *, 'pp008.F FAIL: ', res + print *, 'pp008.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp009.F b/flang/test/Preprocessing/pp009.F index a53623ff03692..c29021da484aa 100644 --- a/flang/test/Preprocessing/pp009.F +++ b/flang/test/Preprocessing/pp009.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call split across continuation, implicit padding integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL +M(666) if (res .eq. 777) then - print *, 'pp009.F pass' + print *, 'pp009.F yes' else - print *, 'pp009.F FAIL: ', res + print *, 'pp009.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp010.F b/flang/test/Preprocessing/pp010.F index 0769c98274dd3..4a812cde86af0 100644 --- a/flang/test/Preprocessing/pp010.F +++ b/flang/test/Preprocessing/pp010.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * ditto, but with intervening *comment line integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main *comment +M(666) if (res .eq. 777) then - print *, 'pp010.F pass' + print *, 'pp010.F yes' else - print *, 'pp010.F FAIL: ', res + print *, 'pp010.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp011.F b/flang/test/Preprocessing/pp011.F index 4ec376649422b..c106c8bc3f5c5 100644 --- a/flang/test/Preprocessing/pp011.F +++ b/flang/test/Preprocessing/pp011.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=iflm(666) * FLM call name split across continuation, clipped integer function IFLM(x) integer :: x @@ -12,8 +14,8 @@ program main res = IFL comment +M(666) if (res .eq. 777) then - print *, 'pp011.F pass' + print *, 'pp011.F yes' else - print *, 'pp011.F FAIL: ', res + print *, 'pp011.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp012.F b/flang/test/Preprocessing/pp012.F index 703fabf7d8db6..411cfb887bb1f 100644 --- a/flang/test/Preprocessing/pp012.F +++ b/flang/test/Preprocessing/pp012.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call name split across continuation integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL +M(666) if (res .eq. 777) then - print *, 'pp012.F pass' + print *, 'pp012.F yes' else - print *, 'pp012.F FAIL: ', res + print *, 'pp012.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp013.F b/flang/test/Preprocessing/pp013.F index 6fb8ca75b5c83..f05e2e30fee69 100644 --- a/flang/test/Preprocessing/pp013.F +++ b/flang/test/Preprocessing/pp013.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call split between name and ( integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM +(666) if (res .eq. 777) then - print *, 'pp013.F pass' + print *, 'pp013.F yes' else - print *, 'pp013.F FAIL: ', res + print *, 'pp013.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp014.F b/flang/test/Preprocessing/pp014.F index 397a31d1a8aaa..470966096e4ee 100644 --- a/flang/test/Preprocessing/pp014.F +++ b/flang/test/Preprocessing/pp014.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call split between name and (, with intervening *comment integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main *comment +(666) if (res .eq. 
777) then - print *, 'pp014.F pass' + print *, 'pp014.F yes' else - print *, 'pp014.F FAIL: ', res + print *, 'pp014.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp015.F b/flang/test/Preprocessing/pp015.F index 4c399a835567e..8320bd704fed9 100644 --- a/flang/test/Preprocessing/pp015.F +++ b/flang/test/Preprocessing/pp015.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call split between name and (, clipped integer function IFLM(x) integer :: x @@ -12,8 +14,8 @@ program main res = IFLM comment +(666) if (res .eq. 777) then - print *, 'pp015.F pass' + print *, 'pp015.F yes' else - print *, 'pp015.F FAIL: ', res + print *, 'pp015.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp016.F b/flang/test/Preprocessing/pp016.F index 210ad0b3fddbf..95c89f8e1a03e 100644 --- a/flang/test/Preprocessing/pp016.F +++ b/flang/test/Preprocessing/pp016.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call split between name and ( and in argument integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main +(66 +6) if (res .eq. 777) then - print *, 'pp016.F pass' + print *, 'pp016.F yes' else - print *, 'pp016.F FAIL: ', res + print *, 'pp016.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp017.F b/flang/test/Preprocessing/pp017.F index e658fbd922cf4..a11f185147831 100644 --- a/flang/test/Preprocessing/pp017.F +++ b/flang/test/Preprocessing/pp017.F @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then * KLM rescan integer, parameter :: KWM = 666, KWM2 = 667 #define KWM2 777 #define KWM KWM2 if (KWM .eq. 777) then - print *, 'pp017.F pass' + print *, 'pp017.F yes' else - print *, 'pp017.F FAIL: ', KWM + print *, 'pp017.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp018.F b/flang/test/Preprocessing/pp018.F index 877c6545e1c82..69c24e59f53d6 100644 --- a/flang/test/Preprocessing/pp018.F +++ b/flang/test/Preprocessing/pp018.F @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm2.eq.777)then * KLM rescan with #undef (so rescan is after expansion) integer, parameter :: KWM2 = 777, KWM = 667 #define KWM2 666 #define KWM KWM2 #undef KWM2 if (KWM .eq. 777) then - print *, 'pp018.F pass' + print *, 'pp018.F yes' else - print *, 'pp018.F FAIL: ', KWM + print *, 'pp018.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp019.F b/flang/test/Preprocessing/pp019.F index a2c9a02848575..d607ad3484079 100644 --- a/flang/test/Preprocessing/pp019.F +++ b/flang/test/Preprocessing/pp019.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM rescan integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main integer :: res res = IFLM(666) if (res .eq. 777) then - print *, 'pp019.F pass' + print *, 'pp019.F yes' else - print *, 'pp019.F FAIL: ', res + print *, 'pp019.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp020.F b/flang/test/Preprocessing/pp020.F index f0d26357c5d2f..88525003ba893 100644 --- a/flang/test/Preprocessing/pp020.F +++ b/flang/test/Preprocessing/pp020.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((111)+666) * FLM expansion of argument integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main integer :: res res = IFLM(KWM) if (res .eq. 
777) then - print *, 'pp020.F pass' + print *, 'pp020.F yes' else - print *, 'pp020.F FAIL: ', res + print *, 'pp020.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp021.F b/flang/test/Preprocessing/pp021.F index 45073ab6f1e6c..1662a680f5aa9 100644 --- a/flang/test/Preprocessing/pp021.F +++ b/flang/test/Preprocessing/pp021.F @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch='KWM' +! CHECK: if(ch.eq.'KWM')then * KWM NOT expanded in 'literal' #define KWM 666 character(len=3) :: ch ch = 'KWM' if (ch .eq. 'KWM') then - print *, 'pp021.F pass' + print *, 'pp021.F yes' else - print *, 'pp021.F FAIL: ', ch + print *, 'pp021.F no: ', ch end if end diff --git a/flang/test/Preprocessing/pp022.F b/flang/test/Preprocessing/pp022.F index e9a1e8ba4b915..026c02e9d4f01 100644 --- a/flang/test/Preprocessing/pp022.F +++ b/flang/test/Preprocessing/pp022.F @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch="KWM" +! CHECK: if(ch.eq.'KWM')then * KWM NOT expanded in "literal" #define KWM 666 character(len=3) :: ch ch = "KWM" if (ch .eq. 'KWM') then - print *, 'pp022.F pass' + print *, 'pp022.F yes' else - print *, 'pp022.F FAIL: ', ch + print *, 'pp022.F no: ', ch end if end diff --git a/flang/test/Preprocessing/pp023.F b/flang/test/Preprocessing/pp023.F index fb63d63f4fc1e..a0d053f6addc2 100644 --- a/flang/test/Preprocessing/pp023.F +++ b/flang/test/Preprocessing/pp023.F @@ -1,11 +1,14 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch=3hKWM +! CHECK: if(ch.eq.'KWM')then * KWM NOT expanded in 9HHOLLERITH literal #define KWM 666 #define HKWM 667 character(len=3) :: ch ch = 3HKWM if (ch .eq. 'KWM') then - print *, 'pp023.F pass' + print *, 'pp023.F yes' else - print *, 'pp023.F FAIL: ', ch + print *, 'pp023.F no: ', ch end if end diff --git a/flang/test/Preprocessing/pp024.F b/flang/test/Preprocessing/pp024.F index 9072f6e50cc89..6ea76cc9c24de 100644 --- a/flang/test/Preprocessing/pp024.F +++ b/flang/test/Preprocessing/pp024.F @@ -1,3 +1,6 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: 100format(3hKWM) +! CHECK: if(ch.eq.'KWM')then * KWM NOT expanded in Hollerith in FORMAT #define KWM 666 #define HKWM 667 @@ -5,8 +8,8 @@ 100 format(3HKWM) write(ch, 100) if (ch .eq. 'KWM') then - print *, 'pp024.F pass' + print *, 'pp024.F yes' else - print *, 'pp024.F FAIL: ', ch + print *, 'pp024.F no: ', ch end if end diff --git a/flang/test/Preprocessing/pp025.F b/flang/test/Preprocessing/pp025.F index 42ad011842ff7..49521d443bd3b 100644 --- a/flang/test/Preprocessing/pp025.F +++ b/flang/test/Preprocessing/pp025.F @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=ikwm2z * KWM expansion is before token pasting due to fixed-form space removal integer, parameter :: IKWM2Z = 777 #define KWM KWM2 integer :: res res = I KWM Z if (res .eq. 777) then - print *, 'pp025.F pass' + print *, 'pp025.F yes' else - print *, 'pp025.F FAIL: ', res + print *, 'pp025.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp026.F b/flang/test/Preprocessing/pp026.F index e0ea032c383fc..b551f3b173ed3 100644 --- a/flang/test/Preprocessing/pp026.F +++ b/flang/test/Preprocessing/pp026.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((111)+666) * ## token pasting works in FLM integer function IFLM(x) integer :: x @@ -12,8 +14,8 @@ program main integer :: res res = IFLM(KWM) if (res .eq. 
777) then - print *, 'pp026.F pass' + print *, 'pp026.F yes' else - print *, 'pp026.F FAIL: ', res + print *, 'pp026.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp027.F b/flang/test/Preprocessing/pp027.F index e2663800c1ce3..c1b787f9366d3 100644 --- a/flang/test/Preprocessing/pp027.F +++ b/flang/test/Preprocessing/pp027.F @@ -1,9 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: kwm=666 +! CHECK: if(777.eq.777)then * #DEFINE works in fixed form integer, parameter :: KWM = 666 #DEFINE KWM 777 if (KWM .eq. 777) then - print *, 'pp027.F pass' + print *, 'pp027.F yes' else - print *, 'pp027.F FAIL: ', KWM + print *, 'pp027.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp028.F b/flang/test/Preprocessing/pp028.F index 2906d389b57d4..51eab84f36877 100644 --- a/flang/test/Preprocessing/pp028.F +++ b/flang/test/Preprocessing/pp028.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=kw * fixed-form clipping done before KWM expansion on source line integer, parameter :: KW = 777 #define KWM 666 @@ -7,8 +9,8 @@ *234567890123456789012345678901234567890123456789012345678901234567890123 res = KWM if (res .eq. 777) then - print *, 'pp028.F pass' + print *, 'pp028.F yes' else - print *, 'pp028.F FAIL: ', res + print *, 'pp028.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp029.F b/flang/test/Preprocessing/pp029.F index 4374ef84489b5..bb8efe6c1a2e0 100644 --- a/flang/test/Preprocessing/pp029.F +++ b/flang/test/Preprocessing/pp029.F @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(77 7.eq.777)then * \ newline allowed in #define integer, parameter :: KWM = 666 #define KWM 77\ 7 if (KWM .eq. 777) then - print *, 'pp029.F pass' + print *, 'pp029.F yes' else - print *, 'pp029.F FAIL: ', KWM + print *, 'pp029.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp030.F b/flang/test/Preprocessing/pp030.F index 3022e0ddf3dff..c04cf949f4142 100644 --- a/flang/test/Preprocessing/pp030.F +++ b/flang/test/Preprocessing/pp030.F @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then * /* C comment */ erased from #define integer, parameter :: KWM = 666 #define KWM 777 /* C comment */ if (KWM .eq. 777) then - print *, 'pp030.F pass' + print *, 'pp030.F yes' else - print *, 'pp030.F FAIL: ', KWM + print *, 'pp030.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp031.F b/flang/test/Preprocessing/pp031.F index 0f59921bcb820..90b14647c4c93 100644 --- a/flang/test/Preprocessing/pp031.F +++ b/flang/test/Preprocessing/pp031.F @@ -1,9 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777//ccomment.eq.777)then +! CHECK: print*,'pp031.F no: ',777//ccomment * // C++ comment NOT erased from #define integer, parameter :: KWM = 666 #define KWM 777 // C comment if (KWM .eq. 777) then print *, 'pp031.F FAIL (should not have compiled)' else - print *, 'pp031.F FAIL: ', KWM + print *, 'pp031.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp032.F b/flang/test/Preprocessing/pp032.F index 9d9f14238d86d..6b779141ecb56 100644 --- a/flang/test/Preprocessing/pp032.F +++ b/flang/test/Preprocessing/pp032.F @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then +! CHECK: print*,'pp032.F no: ',777 * /* C comment */ \ newline erased from #define integer, parameter :: KWM = 666 #define KWM 77/* C comment */\ 7 if (KWM .eq. 
777) then - print *, 'pp032.F pass' + print *, 'pp032.F yes' else - print *, 'pp032.F FAIL: ', KWM + print *, 'pp032.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp033.F b/flang/test/Preprocessing/pp033.F index 34cf1996cc74d..3364527cba6a3 100644 --- a/flang/test/Preprocessing/pp033.F +++ b/flang/test/Preprocessing/pp033.F @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then +! CHECK: print*,'pp033.F no: ',777 * /* C comment \ newline */ erased from #define integer, parameter :: KWM = 666 #define KWM 77/* C comment \ */7 if (KWM .eq. 777) then - print *, 'pp033.F pass' + print *, 'pp033.F yes' else - print *, 'pp033.F FAIL: ', KWM + print *, 'pp033.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp034.F b/flang/test/Preprocessing/pp034.F index a9ed984b3b6ec..0c64aca62b1db 100644 --- a/flang/test/Preprocessing/pp034.F +++ b/flang/test/Preprocessing/pp034.F @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then +! CHECK: print*,'pp034.F no: ',777 * \ newline allowed in name on KWM definition integer, parameter :: KWMC = 666 #define KWM\ C 777 if (KWMC .eq. 777) then - print *, 'pp034.F pass' + print *, 'pp034.F yes' else - print *, 'pp034.F FAIL: ', KWMC + print *, 'pp034.F no: ', KWMC end if end diff --git a/flang/test/Preprocessing/pp035.F b/flang/test/Preprocessing/pp035.F index 0135c9c4551a7..808174129b897 100644 --- a/flang/test/Preprocessing/pp035.F +++ b/flang/test/Preprocessing/pp035.F @@ -1,3 +1,6 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777.eq.777)then +! CHECK: print*,'pp035.F no: ',777 * #if 2 .LT. 3 works integer, parameter :: KWM = 666 #if 2 .LT. 3 @@ -6,8 +9,8 @@ #define KWM 667 #endif if (KWM .eq. 777) then - print *, 'pp035.F pass' + print *, 'pp035.F yes' else - print *, 'pp035.F FAIL: ', KWM + print *, 'pp035.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp036.F b/flang/test/Preprocessing/pp036.F index ac922ae42ceb5..b7024c41fa60f 100644 --- a/flang/test/Preprocessing/pp036.F +++ b/flang/test/Preprocessing/pp036.F @@ -1,8 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(.true.)then +! CHECK: print*,'pp036.F no: ',.true. * #define FALSE TRUE ... .FALSE. -> .TRUE. #define FALSE TRUE if (.FALSE.) then - print *, 'pp036.F pass' + print *, 'pp036.F yes' else - print *, 'pp036.F FAIL: ', .FALSE. + print *, 'pp036.F no: ', .FALSE. end if end diff --git a/flang/test/Preprocessing/pp037.F b/flang/test/Preprocessing/pp037.F index 6c3edb09eb6f4..52bfa8ffdab3d 100644 --- a/flang/test/Preprocessing/pp037.F +++ b/flang/test/Preprocessing/pp037.F @@ -1,11 +1,14 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(7777.eq.777)then +! CHECK: print*,'pp037.F no: ',7777 * fixed-form clipping NOT applied to #define integer, parameter :: KWM = 666 * 1 2 3 4 5 6 7 *234567890123456789012345678901234567890123456789012345678901234567890123 #define KWM 7777 if (KWM .eq. 777) then - print *, 'pp037.F pass' + print *, 'pp037.F yes' else - print *, 'pp037.F FAIL: ', KWM + print *, 'pp037.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp038.F b/flang/test/Preprocessing/pp038.F index 3c83dda7d03b7..7386aeebddeef 100644 --- a/flang/test/Preprocessing/pp038.F +++ b/flang/test/Preprocessing/pp038.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=((666)+111) * FLM call with closing ')' on next line (not a continuation) integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM(666 ) if (res .eq. 
777) then - print *, 'pp038.F pass' + print *, 'pp038.F yes' else - print *, 'pp038.F FAIL: ', res + print *, 'pp038.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp039.F b/flang/test/Preprocessing/pp039.F index 52e6dd78603a0..9124474b86ba3 100644 --- a/flang/test/Preprocessing/pp039.F +++ b/flang/test/Preprocessing/pp039.F @@ -1,3 +1,7 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res=iflm +! CHECK: (666) +! CHECK-NOT: res=((666)+111) * FLM call with '(' on next line (not a continuation) integer function IFLM(x) integer :: x @@ -9,8 +13,8 @@ program main res = IFLM (666) if (res .eq. 777) then - print *, 'pp039.F pass' + print *, 'pp039.F yes' else - print *, 'pp039.F FAIL: ', res + print *, 'pp039.F no: ', res end if end diff --git a/flang/test/Preprocessing/pp040.F b/flang/test/Preprocessing/pp040.F index 59e901ac3c6b6..d589c38489cfb 100644 --- a/flang/test/Preprocessing/pp040.F +++ b/flang/test/Preprocessing/pp040.F @@ -1,5 +1,7 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK-NOT: FAIL HARD! * #define KWM c, then KWM works as comment line initiator #define KWM c KWM print *, 'pp040.F FAIL HARD!'; stop - print *, 'pp040.F pass' + print *, 'pp040.F yes' end diff --git a/flang/test/Preprocessing/pp041.F b/flang/test/Preprocessing/pp041.F index 33c5ced3924dc..dcf61536040ba 100644 --- a/flang/test/Preprocessing/pp041.F +++ b/flang/test/Preprocessing/pp041.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: j=666wmj=j+1wm211 * use KWM expansion as continuation indicators #define KWM 0 #define KWM2 1 @@ -6,8 +8,8 @@ KWM j = j + 1 KWM2 11 if (j .eq. 777) then - print *, 'pp041.F pass' + print *, 'pp041.F yes' else - print *, 'pp041.F FAIL', j + print *, 'pp041.F no', j end if end diff --git a/flang/test/Preprocessing/pp042.F b/flang/test/Preprocessing/pp042.F index 439e1affbca22..d5cc46a7a4712 100644 --- a/flang/test/Preprocessing/pp042.F +++ b/flang/test/Preprocessing/pp042.F @@ -1,6 +1,8 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK-NOT: goto 2 * #define c 1, then use c as label in fixed-form #define c 1 -c print *, 'pp042.F pass'; goto 2 - print *, 'pp042.F FAIL' +c print *, 'pp042.F yes'; goto 2 + print *, 'pp042.F no' 2 continue end diff --git a/flang/test/Preprocessing/pp043.F b/flang/test/Preprocessing/pp043.F index be0069cf85579..a079466e64f92 100644 --- a/flang/test/Preprocessing/pp043.F +++ b/flang/test/Preprocessing/pp043.F @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm.eq.777)then * #define with # in column 6 is a continuation line in fixed-form integer, parameter :: defineKWM666 = 555 integer, parameter :: KWM = #define KWM 666 ++222 if (KWM .eq. 777) then - print *, 'pp043.F pass' + print *, 'pp043.F yes' else - print *, 'pp043.F FAIL: ', KWM + print *, 'pp043.F no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp044.F b/flang/test/Preprocessing/pp044.F index 72ce6cc411598..c14b29c3050cc 100644 --- a/flang/test/Preprocessing/pp044.F +++ b/flang/test/Preprocessing/pp044.F @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK-NOT:z=111 * #define directive amid continuations integer, parameter :: KWM = 222, KWM111 = 333, KWM222 = 555 integer, parameter :: KWMKWM = 333 @@ -5,8 +7,8 @@ #define KWM 111 +KWM+444 if (z .EQ. 
777) then - print *, 'pass' + print *, 'yes' else - print *, 'FAIL', z + print *, 'no', z end if end diff --git a/flang/test/Preprocessing/pp101.F90 b/flang/test/Preprocessing/pp101.F90 index 694201a8f33e0..b0f860157c82e 100644 --- a/flang/test/Preprocessing/pp101.F90 +++ b/flang/test/Preprocessing/pp101.F90 @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777 .eq. 777) then ! keyword macros integer, parameter :: KWM = 666 #define KWM 777 if (KWM .eq. 777) then - print *, 'pp101.F90 pass' + print *, 'pp101.F90 yes' else - print *, 'pp101.F90 FAIL: ', KWM + print *, 'pp101.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp102.F90 b/flang/test/Preprocessing/pp102.F90 index 22e4613b3b18a..e2c3207a4c248 100644 --- a/flang/test/Preprocessing/pp102.F90 +++ b/flang/test/Preprocessing/pp102.F90 @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm .eq. 777) then ! #undef integer, parameter :: KWM = 777 #define KWM 666 #undef KWM if (KWM .eq. 777) then - print *, 'pp102.F90 pass' + print *, 'pp102.F90 yes' else - print *, 'pp102.F90 FAIL: ', KWM + print *, 'pp102.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp103.F90 b/flang/test/Preprocessing/pp103.F90 index 9df4c9dbdf7b0..3309d7f8205f0 100644 --- a/flang/test/Preprocessing/pp103.F90 +++ b/flang/test/Preprocessing/pp103.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! function-like macros integer function IFLM(x) integer :: x @@ -8,8 +10,8 @@ program main integer :: res res = IFLM(666) if (res .eq. 777) then - print *, 'pp103.F90 pass' + print *, 'pp103.F90 yes' else - print *, 'pp103.F90 FAIL: ', res + print *, 'pp103.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp104.F90 b/flang/test/Preprocessing/pp104.F90 index b15f0db7c5b97..51248c22154b9 100644 --- a/flang/test/Preprocessing/pp104.F90 +++ b/flang/test/Preprocessing/pp104.F90 @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm .eq. 777) then ! KWMs case-sensitive integer, parameter :: KWM = 777 #define KWM 666 if (kwm .eq. 777) then - print *, 'pp104.F90 pass' + print *, 'pp104.F90 yes' else - print *, 'pp104.F90 FAIL: ', kwm + print *, 'pp104.F90 no: ', kwm end if end diff --git a/flang/test/Preprocessing/pp105.F90 b/flang/test/Preprocessing/pp105.F90 index cd475db01c39e..898a4a1681377 100644 --- a/flang/test/Preprocessing/pp105.F90 +++ b/flang/test/Preprocessing/pp105.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = 777 ! KWM call name split across continuation, with leading & integer, parameter :: KWM = 666 #define KWM 777 @@ -5,8 +7,8 @@ res = KW& &M if (res .eq. 777) then - print *, 'pp105.F90 pass' + print *, 'pp105.F90 yes' else - print *, 'pp105.F90 FAIL: ', res + print *, 'pp105.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp106.F90 b/flang/test/Preprocessing/pp106.F90 index e169ff70b2cec..d83085fc1d718 100644 --- a/flang/test/Preprocessing/pp106.F90 +++ b/flang/test/Preprocessing/pp106.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = 777 ! ditto, with & ! comment integer, parameter :: KWM = 666 #define KWM 777 @@ -5,8 +7,8 @@ res = KW& ! comment &M if (res .eq. 
777) then - print *, 'pp106.F90 pass' + print *, 'pp106.F90 yes' else - print *, 'pp106.F90 FAIL: ', res + print *, 'pp106.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp107.F90 b/flang/test/Preprocessing/pp107.F90 index bf6d427c04007..6973127a4cfab 100644 --- a/flang/test/Preprocessing/pp107.F90 +++ b/flang/test/Preprocessing/pp107.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = kwm ! KWM call name split across continuation, no leading &, with & ! comment integer, parameter :: KWM = 666 #define KWM 777 @@ -5,8 +7,8 @@ res = KW& ! comment M if (res .eq. 777) then - print *, 'pp107.F90 pass' + print *, 'pp107.F90 yes' else - print *, 'pp107.F90 FAIL: ', res + print *, 'pp107.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp108.F90 b/flang/test/Preprocessing/pp108.F90 index 7ce6ccbdedc17..b07ec984fd811 100644 --- a/flang/test/Preprocessing/pp108.F90 +++ b/flang/test/Preprocessing/pp108.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = kwm ! ditto, but without & ! comment integer, parameter :: KWM = 666 #define KWM 777 @@ -5,8 +7,8 @@ res = KW& M if (res .eq. 777) then - print *, 'pp108.F90 pass' + print *, 'pp108.F90 yes' else - print *, 'pp108.F90 FAIL: ', res + print *, 'pp108.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp109.F90 b/flang/test/Preprocessing/pp109.F90 index a80579d18b3ee..e75fd10c1cb88 100644 --- a/flang/test/Preprocessing/pp109.F90 +++ b/flang/test/Preprocessing/pp109.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! FLM call name split with leading & integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL& &M(666) if (res .eq. 777) then - print *, 'pp109.F90 pass' + print *, 'pp109.F90 yes' else - print *, 'pp109.F90 FAIL: ', res + print *, 'pp109.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp110.F90 b/flang/test/Preprocessing/pp110.F90 index f5bf3b1867f48..681408ebfddf8 100644 --- a/flang/test/Preprocessing/pp110.F90 +++ b/flang/test/Preprocessing/pp110.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! ditto, with & ! comment integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL& ! comment &M(666) if (res .eq. 777) then - print *, 'pp110.F90 pass' + print *, 'pp110.F90 yes' else - print *, 'pp110.F90 FAIL: ', res + print *, 'pp110.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp111.F90 b/flang/test/Preprocessing/pp111.F90 index 668fcdc5f8d39..4b49bf6ad6c92 100644 --- a/flang/test/Preprocessing/pp111.F90 +++ b/flang/test/Preprocessing/pp111.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm (666) ! FLM call name split across continuation, no leading &, with & ! comment integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL& ! comment M(666) if (res .eq. 777) then - print *, 'pp111.F90 pass' + print *, 'pp111.F90 yes' else - print *, 'pp111.F90 FAIL: ', res + print *, 'pp111.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp112.F90 b/flang/test/Preprocessing/pp112.F90 index 0a3c7f8906dc7..9828366bef73a 100644 --- a/flang/test/Preprocessing/pp112.F90 +++ b/flang/test/Preprocessing/pp112.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm (666) ! ditto, but without & ! comment integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFL& M(666) if (res .eq. 
777) then - print *, 'pp112.F90 pass' + print *, 'pp112.F90 yes' else - print *, 'pp112.F90 FAIL: ', res + print *, 'pp112.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp113.F90 b/flang/test/Preprocessing/pp113.F90 index 4c928033638f9..56e34389068f3 100644 --- a/flang/test/Preprocessing/pp113.F90 +++ b/flang/test/Preprocessing/pp113.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! FLM call split across continuation between name and (, leading & integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM& &(666) if (res .eq. 777) then - print *, 'pp113.F90 pass' + print *, 'pp113.F90 yes' else - print *, 'pp113.F90 FAIL: ', res + print *, 'pp113.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp114.F90 b/flang/test/Preprocessing/pp114.F90 index f6c0e0263a2f3..4c44759ba96eb 100644 --- a/flang/test/Preprocessing/pp114.F90 +++ b/flang/test/Preprocessing/pp114.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! ditto, with & ! comment, leading & integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM& ! comment &(666) if (res .eq. 777) then - print *, 'pp114.F90 pass' + print *, 'pp114.F90 yes' else - print *, 'pp114.F90 FAIL: ', res + print *, 'pp114.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp115.F90 b/flang/test/Preprocessing/pp115.F90 index 4a38aca533118..290f62e68948b 100644 --- a/flang/test/Preprocessing/pp115.F90 +++ b/flang/test/Preprocessing/pp115.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm (666) ! ditto, with & ! comment, no leading & integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM& ! comment (666) if (res .eq. 777) then - print *, 'pp115.F90 pass' + print *, 'pp115.F90 yes' else - print *, 'pp115.F90 FAIL: ', res + print *, 'pp115.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp116.F90 b/flang/test/Preprocessing/pp116.F90 index 8708f79347cc6..d41fab771f842 100644 --- a/flang/test/Preprocessing/pp116.F90 +++ b/flang/test/Preprocessing/pp116.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm (666) ! FLM call split between name and (, no leading & integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM& (666) if (res .eq. 777) then - print *, 'pp116.F90 pass' + print *, 'pp116.F90 yes' else - print *, 'pp116.F90 FAIL: ', res + print *, 'pp116.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp117.F90 b/flang/test/Preprocessing/pp117.F90 index 8b8687f03743d..c8d1c2ff31304 100644 --- a/flang/test/Preprocessing/pp117.F90 +++ b/flang/test/Preprocessing/pp117.F90 @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777 .eq. 777) then ! KWM rescan integer, parameter :: KWM = 666, KWM2 = 667 #define KWM2 777 #define KWM KWM2 if (KWM .eq. 777) then - print *, 'pp117.F90 pass' + print *, 'pp117.F90 yes' else - print *, 'pp117.F90 FAIL: ', KWM + print *, 'pp117.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp118.F90 b/flang/test/Preprocessing/pp118.F90 index 014d99791f1aa..e71cc410823ba 100644 --- a/flang/test/Preprocessing/pp118.F90 +++ b/flang/test/Preprocessing/pp118.F90 @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(kwm2 .eq. 777) then ! KWM rescan with #undef, proving rescan after expansion integer, parameter :: KWM2 = 777, KWM = 667 #define KWM2 666 #define KWM KWM2 #undef KWM2 if (KWM .eq. 
777) then - print *, 'pp118.F90 pass' + print *, 'pp118.F90 yes' else - print *, 'pp118.F90 FAIL: ', KWM + print *, 'pp118.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp119.F90 b/flang/test/Preprocessing/pp119.F90 index 37470de411a4e..ea9a15cb02de4 100644 --- a/flang/test/Preprocessing/pp119.F90 +++ b/flang/test/Preprocessing/pp119.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((666)+111) ! FLM rescan integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main integer :: res res = IFLM(666) if (res .eq. 777) then - print *, 'pp119.F90 pass' + print *, 'pp119.F90 yes' else - print *, 'pp119.F90 FAIL: ', res + print *, 'pp119.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp120.F90 b/flang/test/Preprocessing/pp120.F90 index f7e0ae1034908..9c4b88c8d15c9 100644 --- a/flang/test/Preprocessing/pp120.F90 +++ b/flang/test/Preprocessing/pp120.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = ((111)+666) ! FLM expansion of argument integer function IFLM(x) integer :: x @@ -10,8 +12,8 @@ program main integer :: res res = IFLM(KWM) if (res .eq. 777) then - print *, 'pp120.F90 pass' + print *, 'pp120.F90 yes' else - print *, 'pp120.F90 FAIL: ', res + print *, 'pp120.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp121.F90 b/flang/test/Preprocessing/pp121.F90 index bd855fe2f6ab4..cdf25e15ee44e 100644 --- a/flang/test/Preprocessing/pp121.F90 +++ b/flang/test/Preprocessing/pp121.F90 @@ -1,10 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch = 'KWM' +! CHECK: if(ch .eq. 'KWM') then ! KWM NOT expanded in 'literal' #define KWM 666 character(len=3) :: ch ch = 'KWM' if (ch .eq. 'KWM') then - print *, 'pp121.F90 pass' + print *, 'pp121.F90 yes' else - print *, 'pp121.F90 FAIL: ', ch + print *, 'pp121.F90 no: ', ch end if end diff --git a/flang/test/Preprocessing/pp122.F90 b/flang/test/Preprocessing/pp122.F90 index dbad83a61c6ac..14c8762af1c7e 100644 --- a/flang/test/Preprocessing/pp122.F90 +++ b/flang/test/Preprocessing/pp122.F90 @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch = "KWM" ! KWM NOT expanded in "literal" #define KWM 666 character(len=3) :: ch ch = "KWM" if (ch .eq. 'KWM') then - print *, 'pp122.F90 pass' + print *, 'pp122.F90 yes' else - print *, 'pp122.F90 FAIL: ', ch + print *, 'pp122.F90 no: ', ch end if end diff --git a/flang/test/Preprocessing/pp123.F90 b/flang/test/Preprocessing/pp123.F90 index 6e6c45244b8a9..1768cec1c23ba 100644 --- a/flang/test/Preprocessing/pp123.F90 +++ b/flang/test/Preprocessing/pp123.F90 @@ -1,11 +1,13 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: ch = 3hKWM ! KWM NOT expanded in Hollerith literal #define KWM 666 #define HKWM 667 character(len=3) :: ch ch = 3HKWM if (ch .eq. 'KWM') then - print *, 'pp123.F90 pass' + print *, 'pp123.F90 yes' else - print *, 'pp123.F90 FAIL: ', ch + print *, 'pp123.F90 no: ', ch end if end diff --git a/flang/test/Preprocessing/pp124.F90 b/flang/test/Preprocessing/pp124.F90 index 2cf4d56dba238..bb011515ce1c2 100644 --- a/flang/test/Preprocessing/pp124.F90 +++ b/flang/test/Preprocessing/pp124.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: 100 format(3hKWM) ! KWM NOT expanded in Hollerith in FORMAT #define KWM 666 #define HKWM 667 @@ -5,8 +7,8 @@ 100 format(3HKWM) write(ch, 100) if (ch .eq. 
'KWM') then - print *, 'pp124.F90 pass' + print *, 'pp124.F90 yes' else - print *, 'pp124.F90 FAIL: ', ch + print *, 'pp124.F90 no: ', ch end if end diff --git a/flang/test/Preprocessing/pp125.F90 b/flang/test/Preprocessing/pp125.F90 index 5f3875d8e88ef..cf3909b888240 100644 --- a/flang/test/Preprocessing/pp125.F90 +++ b/flang/test/Preprocessing/pp125.F90 @@ -1,9 +1,11 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777 .eq. 777) then ! #DEFINE works in free form integer, parameter :: KWM = 666 #DEFINE KWM 777 if (KWM .eq. 777) then - print *, 'pp125.F90 pass' + print *, 'pp125.F90 yes' else - print *, 'pp125.F90 FAIL: ', KWM + print *, 'pp125.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp126.F90 b/flang/test/Preprocessing/pp126.F90 index c2684c51413b7..fefb7fcf118f4 100644 --- a/flang/test/Preprocessing/pp126.F90 +++ b/flang/test/Preprocessing/pp126.F90 @@ -1,10 +1,12 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: if(777 .eq. 777) then ! \ newline works in #define integer, parameter :: KWM = 666 #define KWM 77\ 7 if (KWM .eq. 777) then - print *, 'pp126.F90 pass' + print *, 'pp126.F90 yes' else - print *, 'pp126.F90 FAIL: ', KWM + print *, 'pp126.F90 no: ', KWM end if end diff --git a/flang/test/Preprocessing/pp127.F90 b/flang/test/Preprocessing/pp127.F90 index 19f83b6afbc9a..08feedf95305d 100644 --- a/flang/test/Preprocessing/pp127.F90 +++ b/flang/test/Preprocessing/pp127.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm(666 ) ! FLM call with closing ')' on next line (not a continuation) integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM(666 ) if (res .eq. 777) then - print *, 'pp127.F90 pass' + print *, 'pp127.F90 yes' else - print *, 'pp127.F90 FAIL: ', res + print *, 'pp127.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp128.F90 b/flang/test/Preprocessing/pp128.F90 index 84b338bc7cc98..46918e05f5c39 100644 --- a/flang/test/Preprocessing/pp128.F90 +++ b/flang/test/Preprocessing/pp128.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: res = iflm ! FLM call with '(' on next line (not a continuation) integer function IFLM(x) integer :: x @@ -9,8 +11,8 @@ program main res = IFLM (666) if (res .eq. 777) then - print *, 'pp128.F90 pass' + print *, 'pp128.F90 yes' else - print *, 'pp128.F90 FAIL: ', res + print *, 'pp128.F90 no: ', res end if end diff --git a/flang/test/Preprocessing/pp129.F90 b/flang/test/Preprocessing/pp129.F90 index a8eea86996838..ab7e7f0baa8b8 100644 --- a/flang/test/Preprocessing/pp129.F90 +++ b/flang/test/Preprocessing/pp129.F90 @@ -1,5 +1,7 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK-NOT: stop ! #define KWM !, then KWM works as comment line initiator #define KWM ! KWM print *, 'pp129.F90 FAIL HARD!'; stop - print *, 'pp129.F90 pass' + print *, 'pp129.F90 yes' end diff --git a/flang/test/Preprocessing/pp130.F90 b/flang/test/Preprocessing/pp130.F90 index c3d8079210c66..af4ad126e6fa4 100644 --- a/flang/test/Preprocessing/pp130.F90 +++ b/flang/test/Preprocessing/pp130.F90 @@ -1,3 +1,5 @@ +! RUN: %f18 -E %s 2>&1 | FileCheck %s +! CHECK: j = j + & ! #define KWM &, use for continuation w/o pasting (ifort and nag seem to continue #define) #define KWM & @@ -6,8 +8,8 @@ j = j + KWM 111 if (j .eq. 
777) then - print *, 'pp130.F90 pass' + print *, 'pp130.F90 yes' else - print *, 'pp130.F90 FAIL', j + print *, 'pp130.F90 no', j end if end diff --git a/flang/test/Semantics/acc-resolve01.f90 b/flang/test/Semantics/acc-resolve01.f90 new file mode 100644 index 0000000000000..7ca0aa3627cee --- /dev/null +++ b/flang/test/Semantics/acc-resolve01.f90 @@ -0,0 +1,22 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenacc + +! Data-Mapping Attribute Clauses +! 2.15.14 default Clause + +subroutine default_none() + integer a(3) + + A = 1 + B = 2 + !$acc parallel default(none) private(c) + !ERROR: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-mapping clause + A(1:2) = 3 + !ERROR: The DEFAULT(NONE) clause requires that 'b' must be listed in a data-mapping clause + B = 4 + C = 5 + !$acc end parallel +end subroutine default_none + +program mm + call default_none() +end diff --git a/flang/test/Semantics/acc-resolve02.f90 b/flang/test/Semantics/acc-resolve02.f90 new file mode 100644 index 0000000000000..da1a417bba52b --- /dev/null +++ b/flang/test/Semantics/acc-resolve02.f90 @@ -0,0 +1,17 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenacc + +subroutine compute() + integer :: a(3), c, i + + a = 1 + !ERROR: 'c' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel firstprivate(c) private(c) + do i = 1, 3 + a(i) = c + end do + !$acc end parallel +end subroutine compute + +program mm + call compute() +end diff --git a/flang/test/Semantics/acc-symbols01.f90 b/flang/test/Semantics/acc-symbols01.f90 new file mode 100644 index 0000000000000..23d54eb93fbef --- /dev/null +++ b/flang/test/Semantics/acc-symbols01.f90 @@ -0,0 +1,26 @@ +! RUN: %S/test_symbols.sh %s %t %f18 -fopenacc + +!DEF: /mm MainProgram +program mm + !DEF: /mm/x ObjectEntity REAL(4) + !DEF: /mm/y ObjectEntity REAL(4) + real x, y + !DEF: /mm/a ObjectEntity INTEGER(4) + !DEF: /mm/b ObjectEntity INTEGER(4) + !DEF: /mm/c ObjectEntity INTEGER(4) + !DEF: /mm/i ObjectEntity INTEGER(4) + integer a(10), b(10), c(10), i + !REF: /mm/b + b = 2 + !$acc parallel present(c) firstprivate(b) private(a) + !$acc loop + !DEF: /mm/Block1/i (AccPrivate, AccPreDetermined) HostAssoc INTEGER(4) + do i=1,10 + !DEF: /mm/Block1/a (AccPrivate) HostAssoc INTEGER(4) + !REF: /mm/Block1/i + !DEF: /mm/Block1/b (AccFirstPrivate) HostAssoc INTEGER(4) + a(i) = b(i) + end do + !$acc end parallel + end program + diff --git a/flang/test/Semantics/array-constr-values.f90 b/flang/test/Semantics/array-constr-values.f90 new file mode 100644 index 0000000000000..30739f8c095bc --- /dev/null +++ b/flang/test/Semantics/array-constr-values.f90 @@ -0,0 +1,60 @@ +! RUN: %S/test_errors.sh %s %t %f18 +! Confirm enforcement of constraints and restrictions in 7.8 +! C7110, C7111, C7112, C7113, C7114, C7115 + +subroutine arrayconstructorvalues() + integer :: intarray(5) + integer(KIND=8) :: k8 = 20 + + TYPE EMPLOYEE + INTEGER AGE + CHARACTER (LEN = 30) NAME + END TYPE EMPLOYEE + TYPE EMPLOYEER + CHARACTER (LEN = 30) NAME + END TYPE EMPLOYEER + + TYPE(EMPLOYEE) :: emparray(3) + class(*), pointer :: unlim_polymorphic + TYPE, ABSTRACT :: base_type + INTEGER :: CARPRIZE + END TYPE + ! Different declared type + !ERROR: Values in array constructor must have the same declared type when no explicit type appears + intarray = (/ 1, 2, 3, 4., 5/) ! C7110 + ! Different kind type parameter + !ERROR: Values in array constructor must have the same declared type when no explicit type appears + intarray = (/ 1,2,3,4, k8 /) ! C7110 + + ! 
C7111 + !ERROR: Value in array constructor of type 'LOGICAL(4)' could not be converted to the type of the array 'INTEGER(4)' + intarray = [integer:: .true., 2, 3, 4, 5] + !ERROR: Value in array constructor of type 'CHARACTER(1)' could not be converted to the type of the array 'INTEGER(4)' + intarray = [integer:: "RAM stores information", 2, 3, 4, 5] + !ERROR: Value in array constructor of type 'employee' could not be converted to the type of the array 'INTEGER(4)' + intarray = [integer:: EMPLOYEE (19, "Jack"), 2, 3, 4, 5] + + ! C7112 + !ERROR: Value in array constructor of type 'INTEGER(4)' could not be converted to the type of the array 'employee' + emparray = (/ EMPLOYEE:: EMPLOYEE(19, "Ganesh"), EMPLOYEE(22, "Omkar"), 19 /) + !ERROR: Value in array constructor of type 'employeer' could not be converted to the type of the array 'employee' + emparray = (/ EMPLOYEE:: EMPLOYEE(19, "Ganesh"), EMPLOYEE(22, "Ram"),EMPLOYEER("ShriniwasPvtLtd") /) + + ! C7113 + !ERROR: Cannot have an unlimited polymorphic value in an array constructor + !ERROR: Values in array constructor must have the same declared type when no explicit type appears + intarray = (/ unlim_polymorphic, 2, 3, 4, 5/) + + ! C7114 + !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches operand types INTEGER(4) and TYPE(base_type) + !ERROR: ABSTRACT derived type 'base_type' may not be used in a structure constructor + !ERROR: Values in array constructor must have the same declared type when no explicit type appears + intarray = (/ base_type(10), 2, 3, 4, 5 /) +end subroutine arrayconstructorvalues +subroutine checkC7115() + real, dimension(10), parameter :: good1 = [(99.9, i = 1, 10)] + real, dimension(100), parameter :: good2 = [((88.8, i = 1, 10), j = 1, 10)] + !ERROR: Implied DO index is active in surrounding implied DO loop and may not have the same name + !ERROR: 'i' is already declared in this scoping unit + real, dimension(100), parameter :: bad = [((88.8, i = 1, 10), i = 1, 10)] +end subroutine checkC7115 diff --git a/flang/test/Semantics/bindings01.f90 b/flang/test/Semantics/bindings01.f90 index 26dd25a24650c..f2d5876167c25 100644 --- a/flang/test/Semantics/bindings01.f90 +++ b/flang/test/Semantics/bindings01.f90 @@ -1,6 +1,6 @@ ! RUN: %S/test_errors.sh %s %t %f18 ! Confirm enforcement of constraints and restrictions in 7.5.7.3 -! and C733, C734 and C779, C780, C781, C782, C783, C784, and C785. +! and C733, C734 and C779, C780, C782, C783, C784, and C785. module m !ERROR: An ABSTRACT derived type must be extensible diff --git a/flang/test/Semantics/boz-literal-constants.f90 b/flang/test/Semantics/boz-literal-constants.f90 new file mode 100644 index 0000000000000..23155cc29cb7d --- /dev/null +++ b/flang/test/Semantics/boz-literal-constants.f90 @@ -0,0 +1,79 @@ +! RUN: %S/test_errors.sh %s %t %f18 +! Confirm enforcement of constraints and restrictions in 7.7 +! C7107, C7108, C7109 + +subroutine bozchecks + ! Type declaration statements + integer :: f, realpart = B"0101", img = B"1111", resint + logical :: resbit + complex :: rescmplx + real :: dbl, e + ! 
C7107 + !ERROR: Invalid digit ('a') in BOZ literal 'b"110a"' + integer, parameter :: a = B"110A" + !ERROR: Invalid digit ('2') in BOZ literal 'b"1232"' + integer, parameter :: b = B"1232" + !ERROR: BOZ literal 'b"010101010101010101010101011111111111111111111111111111111111111111111111111111111111111111111111111111111111000000000000000000000000000000000000"' too large + integer, parameter :: b1 = B"010101010101010101010101011111111111111111111& + &111111111111111111111111111111111111111111111& + &111111111111111111000000000000000000000000000& + &000000000" + ! C7108 + !ERROR: Invalid digit ('8') in BOZ literal 'o"8"' + integer :: c = O"8" + !ERROR: Invalid digit ('a') in BOZ literal 'o"a"' + integer :: d = O"A" + + ! C7109 + ! A) can appear only in data statement + ! B) Argument to intrinsics listed from 16.9 below + ! BGE, BGT, BLE, BLT, CMPLX, DBLE, DSHIFTL, + ! DSHIFTR, IAND, IEOR, INT, IOR, MERGE_BITS, REAL + + ! part A + data f / Z"AA" / ! OK + !ERROR: DATA statement value could not be converted to the type 'COMPLEX(4)' of the object 'rescmplx' + data rescmplx / B"010101" / + ! part B + resbit = BGE(B"0101", B"1111") + resbit = BGT(Z"0101", B"1111") + resbit = BLE(B"0101", B"1111") + resbit = BLT(B"0101", B"1111") + + res = CMPLX (realpart, img, 4) + res = CMPLX (B"0101", B"1111", 4) + + dbl = DBLE(B"1111") + dbl = DBLE(realpart) + + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + dbl = DSHIFTL(B"0101",B"0101",2) + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + dbl = DSHIFTR(B"1010",B"1010",2) + dbl = DSHIFTL(B"0101",5,2) ! OK + dbl = DSHIFTR(B"1010",5,2) ! OK + + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + resint = IAND(B"0001", B"0011") + resint = IAND(B"0001", 3) + + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + resint = IEOR(B"0001", B"0011") + resint = IEOR(B"0001", 3) + + resint = INT(B"1010") + + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + res = IOR(B"0101", B"0011") + res = IOR(B"0101", 3) + + res = MERGE_BITS(13,3,11) + res = MERGE_BITS(B"1101",3,11) + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + res = MERGE_BITS(B"1101",B"0011",11) + !ERROR: Typeless (BOZ) not allowed for both 'i=' & 'j=' arguments + res = MERGE_BITS(B"1101",B"0011",B"1011") + res = MERGE_BITS(B"1101",3,B"1011") + + res = REAL(B"1101") +end subroutine diff --git a/flang/test/Semantics/reshape.f90 b/flang/test/Semantics/reshape.f90 new file mode 100644 index 0000000000000..7749df60b0dca --- /dev/null +++ b/flang/test/Semantics/reshape.f90 @@ -0,0 +1,50 @@ +! RUN: %S/test_errors.sh %s %t %f18 + +!Tests for RESHAPE +program reshaper + ! RESHAPE with arguments SOURCE and SHAPE + integer, parameter :: array1(2,3) = RESHAPE([(n, n=1,6)], [2,3]) + ! RESHAPE with arguments SOURCE, SHAPE, and PAD + integer :: array2(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99]) + ! 
RESHAPE with arguments SOURCE, SHAPE, PAD, and ORDER + integer :: array3(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99], [2, 1]) + !ERROR: Too few elements in 'source=' argument and 'pad=' argument is not present or has null size + integer :: array4(2,3) = RESHAPE([(n, n=1,5)], [2,3]) + !ERROR: Actual argument for 'shape=' has bad type 'REAL(4)' + integer :: array5(2,3) = RESHAPE([(n, n=1,6)], [2.2,3.3]) + !ERROR: 'shape=' argument must be an array of rank 1 + integer :: array6(2,3) = RESHAPE([(n, n=1,6)], RESHAPE([(n, n=1,6)], [2,3])) + !ERROR: 'shape=' argument must be an array of rank 1 + integer :: array7(2,3) = RESHAPE([(n, n=1,4)], 343) + !ERROR: Actual argument for 'pad=' has bad type or kind 'INTEGER(8)' + integer :: array8(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99_8]) + !ERROR: Actual argument for 'pad=' has bad type or kind 'REAL(4)' + real :: array9(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99.9]) + !ERROR: Invalid 'order=' argument in RESHAPE + real :: array10(2,3) = RESHAPE([(n,n=1,4)],[2,3],[99],[2,3]) + !ERROR: Actual argument for 'order=' has bad type 'REAL(4)' + real :: array11(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99], [2.2,3.3]) + !ERROR: Invalid 'order=' argument in RESHAPE + real :: array12(2,3) = RESHAPE([(n, n=1,4)], [2,3], [99], [1]) + !ERROR: Invalid 'order=' argument in RESHAPE + real :: array13(2,3) = RESHAPE([(n, n = 1, 4)], [2, 3], [99], [1, 1]) + + ! Examples that have caused problems + integer :: array14(0,0,0) = RESHAPE([(n,n=1,0)],[0,0,0]) + integer, parameter :: array15(1) = RESHAPE([(n,n=1,2)],[1]) + integer, parameter :: array16(1) = RESHAPE([(n,n=1,8)],[1], [0], array15) + integer, parameter, dimension(3,4) :: array17 = 3 + integer, parameter, dimension(3,4) :: array18 = RESHAPE(array17, [3,4]) + ! Implicit reshape of array of components + type :: dType + integer :: field(2) + end type dType + type(dType), parameter :: array19(*) = [dType::dType(field=[1,2])] + logical, parameter :: lVar = all(array19(:)%field(1) == [2]) + + !ERROR: Size of 'shape=' argument must not be greater than 15 + CALL ext_sub(RESHAPE([(n, n=1,20)], & + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])) + !ERROR: 'shape=' argument must not have a negative extent + CALL ext_sub(RESHAPE([(n, n=1,20)], [1, -5, 3])) +end program reshaper diff --git a/flang/test/Semantics/resolve80.f90 b/flang/test/Semantics/resolve80.f90 index 7e9df344a1611..689187d4e15ed 100644 --- a/flang/test/Semantics/resolve80.f90 +++ b/flang/test/Semantics/resolve80.f90 @@ -26,7 +26,7 @@ module m !WARNING: Attribute 'PASS' cannot be used more than once procedure(subPass), pass, deferred, pass :: passBinding !ERROR: Attributes 'PASS' and 'NOPASS' conflict with each other - procedure(subPassNopass), pass, deferred, nopass :: passNopassBinding + procedure(subPassNopass), pass, deferred, nopass :: passNopassBinding ! 
C781 end type boundProcType contains diff --git a/flang/test/Semantics/resolve91.f90 b/flang/test/Semantics/resolve91.f90 index 2fbcaa384dfc1..c94cda458f3f0 100644 --- a/flang/test/Semantics/resolve91.f90 +++ b/flang/test/Semantics/resolve91.f90 @@ -63,3 +63,14 @@ module m7 !ERROR: Derived type 'ubound' not found integer :: ivar = ubound(iarray)(1) end module m7 + +module m8 + integer :: iVar = 3 + !ERROR: The type of 'ivar' has already been declared + integer :: iVar = 4 + integer, target :: jVar = 5 + integer, target :: kVar = 5 + integer, pointer :: pVar => jVar + !ERROR: The type of 'pvar' has already been declared + integer, pointer :: pVar => kVar +end module m8 diff --git a/flang/test/Semantics/selecttype01.f90 b/flang/test/Semantics/selecttype01.f90 index fe9838ae2760f..c726c232e18de 100644 --- a/flang/test/Semantics/selecttype01.f90 +++ b/flang/test/Semantics/selecttype01.f90 @@ -119,6 +119,7 @@ subroutine CheckC1159b integer :: x !ERROR: Selector 'x' in SELECT TYPE statement must be polymorphic select type (a => x) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement type is (integer) print *,'integer ',a end select @@ -127,6 +128,7 @@ subroutine CheckC1159b subroutine CheckC1159c !ERROR: Selector 'x' in SELECT TYPE statement must be polymorphic select type (a => x) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement type is (integer) print *,'integer ',a end select @@ -164,6 +166,16 @@ subroutine CheckC1162 type is (extsquare) !Handle same types type is (rectangle) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement + type is(integer) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement + type is(real) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement + type is(logical) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement + type is(character(len=*)) + !ERROR: If selector is not unlimited polymorphic, an intrinsic type specification must not be specified in the type guard statement + type is(complex) end select !Unlimited polymorphic objects are allowed. 
@@ -187,6 +199,12 @@ subroutine CheckC1163 !ERROR: Type specification 'square' conflicts with previous type specification class is (square) end select + select type (unlim_polymorphic) + type is (INTEGER(4)) + type is (shape) + !ERROR: Type specification 'INTEGER(4)' conflicts with previous type specification + type is (INTEGER(4)) + end select end subroutine CheckC1164 diff --git a/flang/test/Semantics/symbol02.f90 b/flang/test/Semantics/symbol02.f90 index 4d73edd660446..add42939e4647 100644 --- a/flang/test/Semantics/symbol02.f90 +++ b/flang/test/Semantics/symbol02.f90 @@ -44,7 +44,7 @@ subroutine s2 !REF: /m/x z = x !REF: /m/s/s2/z - !REF: /m/s/y + !DEF: /m/s/s2/y HostAssoc TYPE(t) z = y !REF: /m/s/s call s diff --git a/flang/test/Semantics/symbol03.f90 b/flang/test/Semantics/symbol03.f90 index 2f3620284a833..a398d8db40b0d 100644 --- a/flang/test/Semantics/symbol03.f90 +++ b/flang/test/Semantics/symbol03.f90 @@ -11,7 +11,14 @@ program main !REF: /main/s subroutine s !DEF: /main/s/y (Implicit) ObjectEntity REAL(4) - !REF: /main/x + !DEF: /main/s/x HostAssoc INTEGER(4) y = x + contains + !DEF: /main/s/s2 (Subroutine) Subprogram + subroutine s2 + !DEF: /main/s/s2/z (Implicit) ObjectEntity REAL(4) + !DEF: /main/s/s2/x HostAssoc INTEGER(4) + z = x + end subroutine end subroutine end program diff --git a/flang/test/Semantics/symbol05.f90 b/flang/test/Semantics/symbol05.f90 index 0c36384a6c4eb..03f6fbbd989e3 100644 --- a/flang/test/Semantics/symbol05.f90 +++ b/flang/test/Semantics/symbol05.f90 @@ -33,7 +33,7 @@ subroutine s2 contains !DEF: /s2/s (Subroutine) Subprogram subroutine s - !REF: /s2/x + !DEF: /s2/s/x HostAssoc INTEGER(4) x = 1 !DEF: /s2/s/w (Implicit) ObjectEntity INTEGER(4) w = 1 diff --git a/flang/test/Semantics/symbol11.f90 b/flang/test/Semantics/symbol11.f90 index e6ae26c740e56..3d2be676967f6 100644 --- a/flang/test/Semantics/symbol11.f90 +++ b/flang/test/Semantics/symbol11.f90 @@ -71,10 +71,12 @@ subroutine s3 !DEF: /s3/Block1/y TARGET AssocEntity TYPE(t2) !REF: /s3/t2/a2 i = y%a2 - type is (integer(kind=8)) + !REF: /s3/t1 + type is (t1) !REF: /s3/i - !DEF: /s3/Block2/y TARGET AssocEntity INTEGER(8) - i = y + !DEF: /s3/Block2/y TARGET AssocEntity TYPE(t1) + !REF: /s3/t1/a1 + i = y%a1 class default !DEF: /s3/Block3/y TARGET AssocEntity CLASS(t1) print *, y diff --git a/flang/test/Semantics/symbol16.f90 b/flang/test/Semantics/symbol16.f90 index ce47134fc3377..4fa6f2b9c0eab 100644 --- a/flang/test/Semantics/symbol16.f90 +++ b/flang/test/Semantics/symbol16.f90 @@ -3,7 +3,7 @@ !DEF: /p1 MainProgram program p1 - !DEF: /p1/f (Function) Subprogram INTEGER(4) + !DEF: /p1/f (Function, StmtFunction) Subprogram INTEGER(4) !DEF: /p1/i ObjectEntity INTEGER(4) !DEF: /p1/j ObjectEntity INTEGER(4) integer f, i, j @@ -15,3 +15,13 @@ program p1 !REF: /p1/f j = f(2) end program + +!DEF: /p2 MainProgram +program p2 + !DEF: /p2/f (Function, StmtFunction) Subprogram REAL(4) + !DEF: /p2/f/x (Implicit) ObjectEntity REAL(4) + !DEF: /p2/y (Implicit) ObjectEntity REAL(4) + f(x) = y + !REF: /p2/y + y = 1.0 +end program diff --git a/flang/test/Semantics/symbol18.f90 b/flang/test/Semantics/symbol18.f90 new file mode 100644 index 0000000000000..93987f6741ed6 --- /dev/null +++ b/flang/test/Semantics/symbol18.f90 @@ -0,0 +1,31 @@ +! RUN: %S/test_symbols.sh %s %t %f18 + +! 
Intrinsic function in type declaration statement: type is ignored + +!DEF: /p1 MainProgram +program p1 + !DEF: /p1/cos INTRINSIC (Function) ProcEntity + integer cos + !DEF: /p1/y (Implicit) ObjectEntity REAL(4) + !REF: /p1/cos + !DEF: /p1/x (Implicit) ObjectEntity REAL(4) + y = cos(x) + !REF: /p1/y + !DEF: /p1/sin INTRINSIC (Function) ProcEntity + !REF: /p1/x + y = sin(x) + !REF: /p1/y + !DEF: /f EXTERNAL (Function, Implicit) ProcEntity REAL(4) + !REF: /p1/x + y = f(x) +end program + +!DEF: /f2 (Function) Subprogram REAL(4) +!DEF: /f2/cos EXTERNAL (Function, Implicit) ProcEntity REAL(4) +!DEF: /f2/x (Implicit) ObjectEntity REAL(4) +function f2(cos, x) + !DEF: /f2/f2 (Implicit) ObjectEntity REAL(4) + !REF: /f2/cos + !REF: /f2/x + f2 = cos(x) +end function diff --git a/flang/test/Semantics/test_symbols.sh b/flang/test/Semantics/test_symbols.sh index d2b3d688a39b8..61ff7fdb1e7bf 100755 --- a/flang/test/Semantics/test_symbols.sh +++ b/flang/test/Semantics/test_symbols.sh @@ -16,8 +16,9 @@ diffs=$temp/diffs # Strip out blank lines and all comments except "!DEF:", "!REF:", and "!$omp" sed -e 's/!\([DR]EF:\)/KEEP \1/' -e 's/!\($omp\)/KEEP \1/' \ - -e 's/!.*//' -e 's/ *$//' -e '/^$/d' -e 's/KEEP \([DR]EF:\)/!\1/' \ - -e 's/KEEP \($omp\)/!\1/' \ + -e 's/!\($acc\)/KEEP \1/' -e 's/!.*//' -e 's/ *$//' -e '/^$/d' \ + -e 's/KEEP \([DR]EF:\)/!\1/' -e 's/KEEP \($omp\)/!\1/' \ + -e 's/KEEP \($acc\)/!\1/' \ $src > $src1 egrep -v '![DR]EF:' $src1 > $src2 # strip out DEF and REF comments # compile, inserting comments for symbols: diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index bcafb0d53cc7b..338d04e7e8f5c 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -369,20 +369,24 @@ std::string CompileOtherLanguage(std::string path, DriverOptions &driver) { return {}; } -void Link(std::vector<std::string> &relocatables, DriverOptions &driver) { +void Link(std::vector<std::string> &liblist, std::vector<std::string> &objects, + DriverOptions &driver) { if (!ParentProcess()) { std::vector<char *> argv; for (size_t j{0}; j < driver.F18_FCArgs.size(); ++j) { argv.push_back(driver.F18_FCArgs[j].data()); } - for (auto &relo : relocatables) { - argv.push_back(relo.data()); + for (auto &obj : objects) { + argv.push_back(obj.data()); } if (!driver.outputPath.empty()) { char dashO[3] = "-o"; argv.push_back(dashO); argv.push_back(driver.outputPath.data()); } + for (auto &lib : liblist) { + argv.push_back(lib.data()); + } Exec(argv, driver.verbose); } } @@ -397,6 +401,7 @@ int main(int argc, char *const argv[]) { bool isPGF90{driver.F18_FCArgs.back().rfind("pgf90") != std::string::npos}; std::list<std::string> args{argList(argc, argv)}; + std::vector<std::string> objlist, liblist; std::string prefix{args.front()}; args.pop_front(); prefix += ": "; @@ -413,32 +418,37 @@ int main(int argc, char *const argv[]) { Fortran::common::IntrinsicTypeDefaultKinds defaultKinds; - std::vector<std::string> fortranSources, otherSources, relocatables; + std::vector<std::string> fortranSources, otherSources; bool anyFiles{false}; while (!args.empty()) { std::string arg{std::move(args.front())}; + auto dot{arg.rfind(".")}; + std::string suffix{arg.substr(dot + 1)}; + std::string prefix{arg.substr(0, 2)}; args.pop_front(); if (arg.empty()) { } else if (arg.at(0) != '-') { anyFiles = true; - auto dot{arg.rfind(".")}; if (dot == std::string::npos) { driver.F18_FCArgs.push_back(arg); } else { - std::string suffix{arg.substr(dot + 1)}; if (suffix == "f" || suffix == "F" || suffix == "ff" || suffix == "f90" || suffix == "F90" || suffix == "ff90" || suffix == "f95" || suffix == "F95" || suffix == "ff95" || suffix
== "cuf" || suffix == "CUF" || suffix == "f18" || suffix == "F18" || suffix == "ff18") { fortranSources.push_back(arg); - } else if (suffix == "o" || suffix == "a") { - relocatables.push_back(arg); + } else if (suffix == "o" || suffix == "so") { + objlist.push_back(arg); + } else if (suffix == "a") { + liblist.push_back(arg); } else { otherSources.push_back(arg); } } + } else if (prefix == "-l" || suffix == "a") { + liblist.push_back(arg); } else if (arg == "-") { fortranSources.push_back("-"); } else if (arg == "--") { @@ -545,6 +555,11 @@ defaultKinds.set_defaultIntegerKind(8); defaultKinds.set_subscriptIntegerKind(8); defaultKinds.set_sizeIntegerKind(8); + if (isPGF90) { + driver.F18_FCArgs.push_back("-i8"); + } else { + driver.F18_FCArgs.push_back("-fdefault-integer-8"); + } } else if (arg == "-Mlargearray") { } else if (arg == "-Mnolargearray") { } else if (arg == "-flarge-sizes") { @@ -588,8 +603,21 @@ driver.getSymbolsSources = true; } else if (arg == "-byteswapio") { driver.byteswapio = true; // TODO: Pass to lowering, generate call - } else if (arg == "-help" || arg == "--help" || arg == "-?") { + } else if (arg == "-h" || arg == "-help" || arg == "--help" || arg == "-?") { llvm::errs() + << "f18: LLVM Fortran compiler\n" + << "\n" + << "Usage: f18 [options] <files>\n" + << "\n" + << "Defaults:\n" + << " When invoked with input files, and no options to tell\n" + << " it otherwise, f18 will unparse its input and pass that on to an\n" + << " external compiler to continue the compilation.\n" + << " The external compiler is specified by the F18_FC environment\n" + << " variable. The default is 'gfortran'.\n" + << " If invoked with no input files, f18 reads source code from\n" + << " stdin and runs with -fdebug-measure-parse-tree -funparse.\n" + << "\n" << "f18 options:\n" << " -Mfixed | -Mfree | -ffixed-form | -ffree-form force the " "source form\n" @@ -623,7 +651,8 @@ << " -fget-symbols-sources\n" << " -v -c -o -I -D -U have their usual meanings\n" << " -help print this again\n" - << "Other options are passed through to the compiler.\n"; + << "Unrecognised options are passed through to the external compiler\n" + << "set by F18_FC (see defaults).\n"; return exitStatus; } else if (arg == "-V") { llvm::errs() << "\nf18 compiler (under development)\n"; @@ -671,23 +700,25 @@ if (!anyFiles) { driver.measureTree = true; driver.dumpUnparse = true; + llvm::outs() << "Enter Fortran source\n" + << "Use EOF character (^D) to end file\n"; CompileFortran("-", options, driver, defaultKinds); return exitStatus; } for (const auto &path : fortranSources) { std::string relo{CompileFortran(path, options, driver, defaultKinds)}; if (!driver.compileOnly && !relo.empty()) { - relocatables.push_back(relo); + objlist.push_back(relo); } } for (const auto &path : otherSources) { std::string relo{CompileOtherLanguage(path, driver)}; if (!driver.compileOnly && !relo.empty()) { - relocatables.push_back(relo); + objlist.push_back(relo); } } - if (!relocatables.empty()) { - Link(relocatables, driver); + if (!objlist.empty()) { + Link(liblist, objlist, driver); } return exitStatus; } diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index 21da59f3afcbf..a30f0edaec615 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -9,6 +9,14 @@ if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG)
add_compile_options("-Wno-suggest-override") endif() +function(add_flang_nongtest_unittest test_name) + add_executable(${test_name}.test ${test_name}.cpp) + + target_link_libraries(${test_name}.test ${ARGN}) + + add_dependencies(FlangUnitTests ${test_name}.test) +endfunction() + add_subdirectory(Optimizer) add_subdirectory(Decimal) add_subdirectory(Evaluate) diff --git a/flang/unittests/Decimal/CMakeLists.txt b/flang/unittests/Decimal/CMakeLists.txt index f26aca5d0e9b6..112b02f9029f9 100644 --- a/flang/unittests/Decimal/CMakeLists.txt +++ b/flang/unittests/Decimal/CMakeLists.txt @@ -1,13 +1,10 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -add_executable(quick-sanity-test - quick-sanity-test.cpp -) - -target_link_libraries(quick-sanity-test +add_flang_nongtest_unittest(quick-sanity-test FortranDecimal LLVMSupport ) +# This test is not run by default as it takes a long time to execute add_executable(thorough-test thorough-test.cpp ) @@ -16,5 +13,3 @@ target_link_libraries(thorough-test FortranDecimal LLVMSupport ) - -add_test(NAME Sanity COMMAND quick-sanity-test) diff --git a/flang/unittests/Evaluate/CMakeLists.txt b/flang/unittests/Evaluate/CMakeLists.txt index 54676b0b5a0ad..c56789b598ce7 100644 --- a/flang/unittests/Evaluate/CMakeLists.txt +++ b/flang/unittests/Evaluate/CMakeLists.txt @@ -8,43 +8,22 @@ target_link_libraries(FortranEvaluateTesting LLVMSupport ) -add_executable(leading-zero-bit-count-test - leading-zero-bit-count.cpp -) - -target_link_libraries(leading-zero-bit-count-test +add_flang_nongtest_unittest(leading-zero-bit-count FortranEvaluateTesting LLVMSupport ) -add_executable(bit-population-count-test - bit-population-count.cpp -) - -target_link_libraries(bit-population-count-test +add_flang_nongtest_unittest(bit-population-count FortranEvaluateTesting LLVMSupport ) -add_executable(uint128-test - uint128.cpp -) - -target_link_libraries(uint128-test +add_flang_nongtest_unittest(uint128 FortranEvaluateTesting LLVMSupport ) -# These routines live in lib/Common but we test them here. -add_test(UINT128 uint128-test) -add_test(Leadz leading-zero-bit-count-test) -add_test(PopPar bit-population-count-test) - -add_executable(expression-test - expression.cpp -) - -target_link_libraries(expression-test +add_flang_nongtest_unittest(expression FortranCommon FortranEvaluateTesting FortranEvaluate @@ -53,22 +32,14 @@ target_link_libraries(expression-test LLVMSupport ) -add_executable(integer-test - integer.cpp -) - -target_link_libraries(integer-test +add_flang_nongtest_unittest(integer FortranEvaluateTesting FortranEvaluate FortranSemantics LLVMSupport ) -add_executable(intrinsics-test - intrinsics.cpp -) - -target_link_libraries(intrinsics-test +add_flang_nongtest_unittest(intrinsics FortranCommon FortranEvaluateTesting FortranEvaluate @@ -79,11 +50,7 @@ target_link_libraries(intrinsics-test LLVMSupport ) -add_executable(logical-test - logical.cpp -) - -target_link_libraries(logical-test +add_flang_nongtest_unittest(logical FortranEvaluateTesting FortranEvaluate FortranSemantics @@ -96,24 +63,16 @@ target_link_libraries(logical-test # C++ exceptions are enabled for this test. 
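# (Editorial note: LLVM compiles with -fno-exceptions and -fno-rtti by default;
# the two set() calls below opt this one test back in, and it is the
# llvm_update_compile_flags() call further on that actually applies them to the
# target. This describes LLVM's CMake convention; nothing here is changed by
# the patch itself.)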
set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) -add_executable(real-test - real.cpp -) -llvm_update_compile_flags(real-test) - -target_link_libraries(real-test +add_flang_nongtest_unittest(real FortranEvaluateTesting FortranEvaluate FortranDecimal FortranSemantics LLVMSupport ) +llvm_update_compile_flags(real.test) -add_executable(reshape-test - reshape.cpp -) - -target_link_libraries(reshape-test +add_flang_nongtest_unittest(reshape FortranEvaluateTesting FortranSemantics FortranEvaluate @@ -121,11 +80,7 @@ target_link_libraries(reshape-test LLVMSupport ) -add_executable(ISO-Fortran-binding-test - ISO-Fortran-binding.cpp -) - -target_link_libraries(ISO-Fortran-binding-test +add_flang_nongtest_unittest(ISO-Fortran-binding FortranEvaluateTesting FortranEvaluate FortranSemantics @@ -133,23 +88,10 @@ target_link_libraries(ISO-Fortran-binding-test LLVMSupport ) -add_executable(folding-test - folding.cpp -) - -target_link_libraries(folding-test +add_flang_nongtest_unittest(folding FortranCommon FortranEvaluateTesting FortranEvaluate FortranSemantics LLVMSupport ) - -add_test(Expression expression-test) -add_test(Integer integer-test) -add_test(Intrinsics intrinsics-test) -add_test(Logical logical-test) -add_test(Real real-test) -add_test(RESHAPE reshape-test) -add_test(ISO-binding ISO-Fortran-binding-test) -add_test(folding folding-test) diff --git a/flang/unittests/Evaluate/intrinsics.cpp b/flang/unittests/Evaluate/intrinsics.cpp index 57f71965f8d94..7b8895b8da4c0 100644 --- a/flang/unittests/Evaluate/intrinsics.cpp +++ b/flang/unittests/Evaluate/intrinsics.cpp @@ -292,6 +292,43 @@ void TestIntrinsics() { .DoCall(); // bad type // TODO: test other intrinsics + + // Test unrestricted specific to generic name mapping (table 16.2). + TEST(table.GetGenericIntrinsicName("alog") == "log"); + TEST(table.GetGenericIntrinsicName("alog10") == "log10"); + TEST(table.GetGenericIntrinsicName("amod") == "mod"); + TEST(table.GetGenericIntrinsicName("cabs") == "abs"); + TEST(table.GetGenericIntrinsicName("ccos") == "cos"); + TEST(table.GetGenericIntrinsicName("cexp") == "exp"); + TEST(table.GetGenericIntrinsicName("clog") == "log"); + TEST(table.GetGenericIntrinsicName("csin") == "sin"); + TEST(table.GetGenericIntrinsicName("csqrt") == "sqrt"); + TEST(table.GetGenericIntrinsicName("dabs") == "abs"); + TEST(table.GetGenericIntrinsicName("dacos") == "acos"); + TEST(table.GetGenericIntrinsicName("dasin") == "asin"); + TEST(table.GetGenericIntrinsicName("datan") == "atan"); + TEST(table.GetGenericIntrinsicName("datan2") == "atan2"); + TEST(table.GetGenericIntrinsicName("dcos") == "cos"); + TEST(table.GetGenericIntrinsicName("dcosh") == "cosh"); + TEST(table.GetGenericIntrinsicName("ddim") == "dim"); + TEST(table.GetGenericIntrinsicName("dexp") == "exp"); + TEST(table.GetGenericIntrinsicName("dint") == "aint"); + TEST(table.GetGenericIntrinsicName("dlog") == "log"); + TEST(table.GetGenericIntrinsicName("dlog10") == "log10"); + TEST(table.GetGenericIntrinsicName("dmod") == "mod"); + TEST(table.GetGenericIntrinsicName("dnint") == "anint"); + TEST(table.GetGenericIntrinsicName("dsign") == "sign"); + TEST(table.GetGenericIntrinsicName("dsin") == "sin"); + TEST(table.GetGenericIntrinsicName("dsinh") == "sinh"); + TEST(table.GetGenericIntrinsicName("dsqrt") == "sqrt"); + TEST(table.GetGenericIntrinsicName("dtan") == "tan"); + TEST(table.GetGenericIntrinsicName("dtanh") == "tanh"); + TEST(table.GetGenericIntrinsicName("iabs") == "abs"); + TEST(table.GetGenericIntrinsicName("idim") == "dim"); + 
TEST(table.GetGenericIntrinsicName("idnint") == "nint"); + TEST(table.GetGenericIntrinsicName("isign") == "sign"); + // Test a case where specific and generic name are the same. + TEST(table.GetGenericIntrinsicName("acos") == "acos"); } } // namespace Fortran::evaluate diff --git a/flang/unittests/Lower/OpenMPLoweringTest.cpp b/flang/unittests/Lower/OpenMPLoweringTest.cpp index fd580d71fab3c..ad6fe739d16b4 100644 --- a/flang/unittests/Lower/OpenMPLoweringTest.cpp +++ b/flang/unittests/Lower/OpenMPLoweringTest.cpp @@ -71,4 +71,31 @@ TEST_F(OpenMPLoweringTest, TaskYield) { EXPECT_EQ(succeeded(taskYieldOp.verify()), true); } +TEST_F(OpenMPLoweringTest, EmptyParallel) { + // Construct a dummy parse tree node for `!OMP parallel`. + struct Fortran::parser::OmpSimpleStandaloneDirective parallelDirective( + llvm::omp::Directive::OMPD_parallel); + + // Check and lower the `!OMP parallel` node to `ParallelOp` operation of + // OpenMPDialect. + EXPECT_EQ(parallelDirective.v, llvm::omp::Directive::OMPD_parallel); + auto insertPt = mlirOpBuilder->saveInsertionPoint(); + llvm::ArrayRef<mlir::Type> argTy; + mlir::ValueRange range; + llvm::SmallVector<int32_t, 6> operandSegmentSizes(/*Size=*/6, /*Value=*/0); + // Create and insert the operation. + auto parallelOp = mlirOpBuilder->create<mlir::omp::ParallelOp>( + mlirOpBuilder->getUnknownLoc(), argTy, range); + parallelOp.setAttr(mlir::omp::ParallelOp::getOperandSegmentSizeAttr(), + mlirOpBuilder->getI32VectorAttr(operandSegmentSizes)); + parallelOp.getRegion().push_back(new mlir::Block{}); + auto &block = parallelOp.getRegion().back(); + mlirOpBuilder->setInsertionPointToStart(&block); + // Ensure the block is well-formed by giving it a terminator. + mlirOpBuilder->create<mlir::omp::TerminatorOp>( + mlirOpBuilder->getUnknownLoc()); + mlirOpBuilder->restoreInsertionPoint(insertPt); + EXPECT_EQ(succeeded(parallelOp.verify()), true); +} + // main() from gtest_main diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index b13c84e88a028..041b631f59d6b 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -13,30 +13,19 @@ target_link_libraries(RuntimeTesting LLVMSupport ) -add_executable(format-test - format.cpp -) - -target_link_libraries(format-test +add_flang_nongtest_unittest(format RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME Format COMMAND format-test) - -add_executable(hello-world - hello.cpp -) - -target_link_libraries(hello-world +add_flang_nongtest_unittest(hello RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME HelloWorld COMMAND hello-world) - +# This test is not run by default as it requires input.
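# (Editorial note: "requires input" is read here as: the test consumes stdin,
# so it cannot run unattended under the default check target. A plausible
# manual invocation, assuming the binary name built below, would be
#   echo "some input" | ./external-hello-world
# The exact input it expects is not specified by this patch.)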
add_executable(external-hello-world external-hello.cpp ) @@ -46,38 +35,20 @@ target_link_libraries(external-hello-world LLVMSupport ) -add_executable(external-io - external-io.cpp -) - -target_link_libraries(external-io +add_flang_nongtest_unittest(external-io RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME ExternalIO COMMAND external-io) - -add_executable(list-input-test - list-input.cpp -) - -target_link_libraries(list-input-test +add_flang_nongtest_unittest(list-input RuntimeTesting FortranRuntime LLVMSupport ) -add_test(NAME ListInput COMMAND list-input-test) - -add_executable(character-test - character.cpp -) - -target_link_libraries(character-test +add_flang_nongtest_unittest(character RuntimeTesting FortranRuntime LLVMSupport ) - -add_test(NAME CharacterTest COMMAND character-test) diff --git a/flang/unittests/Runtime/hello.cpp b/flang/unittests/Runtime/hello.cpp index f6db4a8e47dc8..c38aedf4f6549 100644 --- a/flang/unittests/Runtime/hello.cpp +++ b/flang/unittests/Runtime/hello.cpp @@ -175,6 +175,7 @@ int main() { {"(E32.17E0,';')", " 0.00000000000000000E+0;"}, {"(G32.17E0,';')", " 0.0000000000000000 ;"}, {"(1P,E32.17,';')", " 0.00000000000000000E+00;"}, + {"(1PE32.17,';')", " 0.00000000000000000E+00;"}, // no comma {"(1P,F32.17,';')", " 0.00000000000000000;"}, {"(1P,G32.17,';')", " 0.0000000000000000 ;"}, {"(2P,E32.17,';')", " 00.0000000000000000E+00;"}, @@ -195,6 +196,7 @@ int main() { {"(E32.17E4,';')", " 0.10000000000000000E+0001;"}, {"(G32.17E4,';')", " 1.0000000000000000 ;"}, {"(1P,E32.17,';')", " 1.00000000000000000E+00;"}, + {"(1PE32.17,';')", " 1.00000000000000000E+00;"}, // no comma {"(1P,F32.17,';')", " 0.10000000000000000;"}, {"(1P,G32.17,';')", " 1.0000000000000000 ;"}, {"(ES32.17,';')", " 1.00000000000000000E+00;"}, diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 6a1ff3bd64a9b..8314df89b0636 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -1,18 +1,27 @@ set(TARGET_LIBC_ENTRYPOINTS + # ctype.h entrypoints + libc.src.ctype.isalnum + libc.src.ctype.isalpha + libc.src.ctype.isdigit + libc.src.ctype.islower + libc.src.ctype.isupper + # errno.h entrypoints libc.src.errno.__errno_location # string.h entrypoints libc.src.string.bzero + libc.src.string.memchr libc.src.string.memcpy libc.src.string.memset - libc.src.string.strcpy + libc.src.string.memrchr libc.src.string.strcat - libc.src.string.strlen - libc.src.string.memchr libc.src.string.strchr - libc.src.string.strstr + libc.src.string.strcpy + libc.src.string.strlen libc.src.string.strnlen + libc.src.string.strrchr + libc.src.string.strstr ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 1ec1a024f85d0..03d5d66e41e79 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -86,6 +86,16 @@ def AssertAPI : PublicAPI<"assert.h"> { ]; } +def CTypeAPI : PublicAPI<"ctype.h"> { + let Functions = [ + "isalnum", + "isalpha", + "isdigit", + "islower", + "isupper", + ]; +} + def MathErrHandlingMacro : MacroDef<"math_errhandling"> { let Defn = [{ #ifndef math_errhandling @@ -194,29 +204,30 @@ def MathAPI : PublicAPI<"math.h"> { def StringAPI : PublicAPI<"string.h"> { let Functions = [ "bzero", + "memchr", + "memcmp", "memcpy", "memmove", - "memcmp", - "memchr", + "memrchr", "memset", - "strcpy", - "strncpy", "strcat", - "strncat", + "strchr", "strcmp", "strcoll", + "strcpy", + "strcspn", + "strerror", + "strlen", + "strncat", "strncmp", - 
"strxfrm", - "strchr", - "strcspn", + "strncpy", + "strnlen", "strpbrk", "strrchr", "strspn", "strstr", "strtok", - "strerror", - "strlen", - "strnlen" + "strxfrm", ]; let TypeDeclarations = [ diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index b20f58c451847..56a99d00d7847 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1,6 +1,13 @@ set(TARGET_LIBC_ENTRYPOINTS # assert.h entrypoints libc.src.assert.__assert_fail + + # ctype.h entrypoints + libc.src.ctype.isalnum + libc.src.ctype.isalpha + libc.src.ctype.isdigit + libc.src.ctype.islower + libc.src.ctype.isupper # errno.h entrypoints libc.src.errno.__errno_location @@ -21,16 +28,18 @@ set(TARGET_LIBC_ENTRYPOINTS # string.h entrypoints libc.src.string.bzero + libc.src.string.memchr libc.src.string.memcpy + libc.src.string.memrchr libc.src.string.memset - libc.src.string.strcpy libc.src.string.strcat - libc.src.string.strlen - libc.src.string.strcmp - libc.src.string.memchr libc.src.string.strchr - libc.src.string.strstr + libc.src.string.strcmp + libc.src.string.strcpy + libc.src.string.strlen libc.src.string.strnlen + libc.src.string.strrchr + libc.src.string.strstr # sys/mman.h entrypoints libc.src.sys.mman.mmap diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index b44681a5db4ec..550f172c32c1e 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -1,5 +1,6 @@ set(TARGET_PUBLIC_HEADERS libc.include.assert_h + libc.include.ctype libc.include.errno libc.include.math libc.include.signal diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt index db3def324fccd..1885ee5f66ebf 100644 --- a/libc/fuzzing/string/CMakeLists.txt +++ b/libc/fuzzing/string/CMakeLists.txt @@ -1,3 +1,11 @@ +add_libc_fuzzer( + strcmp_fuzz + SRCS + strcmp_fuzz.cpp + DEPENDS + libc.src.string.strcmp +) + add_libc_fuzzer( strcpy_fuzz SRCS @@ -9,9 +17,10 @@ add_libc_fuzzer( ) add_libc_fuzzer( - strcmp_fuzz + strstr_fuzz SRCS - strcmp_fuzz.cpp + strstr_fuzz.cpp DEPENDS - libc.src.string.strcmp + libc.src.string.strstr + libc.src.string.strlen ) diff --git a/libc/fuzzing/string/strstr_fuzz.cpp b/libc/fuzzing/string/strstr_fuzz.cpp new file mode 100644 index 0000000000000..5ccf85c7dc0fc --- /dev/null +++ b/libc/fuzzing/string/strstr_fuzz.cpp @@ -0,0 +1,85 @@ +//===-- strstr_fuzz.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc strstr implementation. +/// +//===----------------------------------------------------------------------===// + +#include "src/string/strlen.h" +#include "src/string/strstr.h" +#include +#include + +// Simple loop to compare two strings up to a size n. +static int simple_memcmp(const char *left, const char *right, size_t n) { + for (; n && *left == *right; ++left, ++right, --n) + ; + return n ? *left - *right : 0; +} + +// The general structure is to take the value of the first byte, set size1 to +// that value, and add the null terminator. size2 will then contain the rest of +// the bytes in data. 
+// For example, with inputs (data={2, 6, 4, 8, 0}, size=5): +// size1: data[0] = 2 +// data1: {2, 6} + '\0' = {2, 6, '\0'} +// size2: size - size1 = 3 +// data2: {4, 8, '\0'} +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + // Verify the size is at least 1 and the data is null terminated. + if (!size || data[size - 1] != '\0') + return 0; + const size_t size1 = (data[0] <= size ? data[0] : size); + // The first size will always be at least 1 since + // we need to append the null terminator. The second size + // needs to be checked since it must also contain the null + // terminator. + if (size - size1 == 0) + return 0; + + // Copy the data into a new container. + uint8_t *container = new uint8_t[size1 + 1]; + if (!container) + __builtin_trap(); + + size_t i; + for (i = 0; i < size1; ++i) + container[i] = data[i]; + container[size1] = '\0'; // Add null terminator to container. + + const char *needle = reinterpret_cast(container); + const char *haystack = reinterpret_cast(data + i); + const char *result = __llvm_libc::strstr(haystack, needle); + + // A null terminator may exist earlier in each, so this needs to be recorded. + const size_t haystack_size = __llvm_libc::strlen(haystack); + const size_t needle_size = __llvm_libc::strlen(needle); + + if (result) { + // The needle is in the haystack. + // 1. Verify that the result matches the needle. + if (simple_memcmp(needle, result, needle_size) != 0) + __builtin_trap(); + + const char *haystack_ptr = haystack; + // 2. Verify that the result is the first occurrence of the needle. + for (; haystack_ptr != result; ++haystack_ptr) { + if (simple_memcmp(needle, haystack_ptr, needle_size) == 0) + __builtin_trap(); // There was an earlier occurrence of the needle. + } + } else { + // No result was found. Verify that the needle doesn't exist within the + // haystack. + for (size_t i = 0; i + needle_size < haystack_size; ++i) { + if (simple_memcmp(needle, haystack + i, needle_size) == 0) + __builtin_trap(); // There was an earlier occurrence of the needle. + } + } + delete[] container; + return 0; +} diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 4dcd69a62bfbf..f676342a134e5 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -17,10 +17,10 @@ add_header( __llvm-libc-stdc-types.h ) -add_header( +add_gen_header( ctype - HDR - ctype.h + DEF_FILE ctype.h.def + GEN_HDR ctype.h DEPENDS .llvm_libc_common_h ) diff --git a/libc/include/ctype.h b/libc/include/ctype.h.def similarity index 59% rename from libc/include/ctype.h rename to libc/include/ctype.h.def index 862a1fb71c17f..ac52a36bf72f5 100644 --- a/libc/include/ctype.h +++ b/libc/include/ctype.h.def @@ -1,4 +1,4 @@ -//===-- C standard library header ctype.h ---------------------------------===// +//===-- C standard library header ctype.h --------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,36 +11,6 @@ #include <__llvm-libc-common.h> -__BEGIN_C_DECLS - -int isalnum(int); - -int isalpha(int); - -int isblank(int); - -int iscntrl(int); - -int isdigit(int); - -int isgraph(int); - -int islower(int); - -int isprint(int); - -int ispunct(int); - -int isspace(int); - -int isupper(int); - -int isxdigit(int); - -int tolower(int); - -int toupper(int); - -__END_C_DECLS +%%public_api() #endif // LLVM_LIBC_CTYPE_H diff --git a/libc/spec/gnu_ext.td b/libc/spec/gnu_ext.td index 7ac99783bc470..d85c562d9256a 100644 --- a/libc/spec/gnu_ext.td +++ b/libc/spec/gnu_ext.td @@ -12,8 +12,22 @@ def GnuExtensions : StandardSpec<"GNUExtensions"> { >, ] >; + + HeaderSpec String = HeaderSpec< + "string.h", + [], // Macros + [], // Types + [], // Enumerations + [ + FunctionSpec< + "memrchr", + RetValSpec, + [ArgSpec, ArgSpec, ArgSpec] + >, + ] + >; let Headers = [ - Math, + Math, String, ]; } diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 6a11b002d8742..4fffc5cdc8579 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -39,7 +39,41 @@ def StdC : StandardSpec<"stdc"> { [], // Enumerations [] >; - + + HeaderSpec CType = HeaderSpec< + "ctype.h", + [], // Macros + [], // Types + [], // Enumerations + [ + FunctionSpec< + "isalnum", + RetValSpec, + [ArgSpec] + >, + FunctionSpec< + "isalpha", + RetValSpec, + [ArgSpec] + >, + FunctionSpec< + "isdigit", + RetValSpec, + [ArgSpec] + >, + FunctionSpec< + "islower", + RetValSpec, + [ArgSpec] + >, + FunctionSpec< + "isupper", + RetValSpec, + [ArgSpec] + >, + ] + >; + HeaderSpec String = HeaderSpec< "string.h", [ @@ -386,6 +420,7 @@ def StdC : StandardSpec<"stdc"> { let Headers = [ Assert, + CType, Errno, Math, String, diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index 88d2829d656a0..9e5b06f201ce2 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(assert) +add_subdirectory(ctype) add_subdirectory(errno) add_subdirectory(math) add_subdirectory(signal) diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt new file mode 100644 index 0000000000000..30995ab2f714c --- /dev/null +++ b/libc/src/ctype/CMakeLists.txt @@ -0,0 +1,51 @@ +add_header_library( + ctype_utils + HDRS + ctype_utils.h +) + +add_entrypoint_object( + isalnum + SRCS + isalnum.cpp + HDRS + isalnum.h + DEPENDS + .ctype_utils +) + +add_entrypoint_object( + isalpha + SRCS + isalpha.cpp + HDRS + isalpha.h + DEPENDS + .ctype_utils +) + +add_entrypoint_object( + isdigit + SRCS + isdigit.cpp + HDRS + isdigit.h + DEPENDS + .ctype_utils +) + +add_entrypoint_object( + islower + SRCS + islower.cpp + HDRS + islower.h +) + +add_entrypoint_object( + isupper + SRCS + isupper.cpp + HDRS + isupper.h +) diff --git a/libc/src/ctype/ctype_utils.h b/libc/src/ctype/ctype_utils.h new file mode 100644 index 0000000000000..4e8d3960bb704 --- /dev/null +++ b/libc/src/ctype/ctype_utils.h @@ -0,0 +1,34 @@ +//===-- Collection of utils for implementing ctype functions-------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_CTYPE_UTILS_H +#define LLVM_LIBC_SRC_CTYPE_CTYPE_UTILS_H + +namespace __llvm_libc { +namespace internal { + +// ------------------------------------------------------ +// Rationale: Since these classification functions are +// called in other functions, we will avoid the overhead +// of a function call by inlining them. +// ------------------------------------------------------ + +static inline int isdigit(int c) { + const unsigned ch = c; + return (ch - '0') < 10; +} + +static inline int isalpha(int c) { + const unsigned ch = c; + return (ch | 32) - 'a' < 26; +} + +} // namespace internal +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_CTYPE_UTILS_H diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp new file mode 100644 index 0000000000000..08b6520e44267 --- /dev/null +++ b/libc/src/ctype/isalnum.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of isalnum------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalnum.h" +#include "src/ctype/ctype_utils.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +// TODO: Currently restricted to default locale. +// These should be extended using locale information. +int LLVM_LIBC_ENTRYPOINT(isalnum)(int c) { + return internal::isalpha(c) || internal::isdigit(c); +} + +} // namespace __llvm_libc diff --git a/libc/src/ctype/isalnum.h b/libc/src/ctype/isalnum.h new file mode 100644 index 0000000000000..beb202a7529da --- /dev/null +++ b/libc/src/ctype/isalnum.h @@ -0,0 +1,18 @@ +//===-- Implementation header for isalnum -------------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALNUM_H +#define LLVM_LIBC_SRC_CTYPE_ISALNUM_H + +namespace __llvm_libc { + +int isalnum(int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_ISALNUM_H diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp new file mode 100644 index 0000000000000..4b3725477a7d4 --- /dev/null +++ b/libc/src/ctype/isalpha.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of isalpha------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalpha.h" + +#include "src/__support/common.h" +#include "src/ctype/ctype_utils.h" + +namespace __llvm_libc { + +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
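+// Editorial aside on internal::isalpha (a sketch that assumes ASCII; not part
+// of this patch): OR-ing with 32 maps 'A'..'Z' onto 'a'..'z' and leaves
+// lowercase letters unchanged, so one unsigned subtract-and-compare handles
+// both cases:
+//
+//   bool is_ascii_letter(int c) {
+//     const unsigned ch = c;        // negative inputs wrap to large values
+//     return (ch | 32) - 'a' < 26;  // ('A' | 32) == 'a', ('Z' | 32) == 'z'
+//   }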
+int LLVM_LIBC_ENTRYPOINT(isalpha)(int c) { return internal::isalpha(c); } + +} // namespace __llvm_libc diff --git a/libc/src/ctype/isalpha.h b/libc/src/ctype/isalpha.h new file mode 100644 index 0000000000000..d5697a39e9aa5 --- /dev/null +++ b/libc/src/ctype/isalpha.h @@ -0,0 +1,18 @@ +//===-- Implementation header for isalpha -------------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISALPHA_H +#define LLVM_LIBC_SRC_CTYPE_ISALPHA_H + +namespace __llvm_libc { + +int isalpha(int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_ISALPHA_H diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp new file mode 100644 index 0000000000000..94ec42a0a3d8d --- /dev/null +++ b/libc/src/ctype/isdigit.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of isdigit------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isdigit.h" +#include "src/__support/common.h" +#include "src/ctype/ctype_utils.h" + +namespace __llvm_libc { + +// TODO: Currently restricted to default locale. +// These should be extended using locale information. +int LLVM_LIBC_ENTRYPOINT(isdigit)(int c) { return internal::isdigit(c); } + +} // namespace __llvm_libc diff --git a/libc/src/ctype/isdigit.h b/libc/src/ctype/isdigit.h new file mode 100644 index 0000000000000..32a76235e0592 --- /dev/null +++ b/libc/src/ctype/isdigit.h @@ -0,0 +1,18 @@ +//===-- Implementation header for isdigit -------------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISDIGIT_H +#define LLVM_LIBC_SRC_CTYPE_ISDIGIT_H + +namespace __llvm_libc { + +int isdigit(int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_ISDIGIT_H diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp new file mode 100644 index 0000000000000..df21355f31ac6 --- /dev/null +++ b/libc/src/ctype/islower.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of islower------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/islower.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +// TODO: Currently restricted to default locale. +// These should be extended using locale information. 
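+// Editorial note: the range check below exploits unsigned wrap-around.
+// Converting c to unsigned first makes any value smaller than 'a' wrap to a
+// huge number after subtracting 'a', so the single comparison
+// (ch - 'a') < 26 is equivalent to ('a' <= c && c <= 'z') for ASCII inputs,
+// and EOF (-1) is rejected as well.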
+int LLVM_LIBC_ENTRYPOINT(islower)(int c) { + const unsigned ch = c; + return (ch - 'a') < 26; +} + +} // namespace __llvm_libc diff --git a/libc/src/ctype/islower.h b/libc/src/ctype/islower.h new file mode 100644 index 0000000000000..7643542fb7a99 --- /dev/null +++ b/libc/src/ctype/islower.h @@ -0,0 +1,18 @@ +//===-- Implementation header for islower -------------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISLOWER_H +#define LLVM_LIBC_SRC_CTYPE_ISLOWER_H + +namespace __llvm_libc { + +int islower(int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_ISLOWER_H diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp new file mode 100644 index 0000000000000..57aed961d1e57 --- /dev/null +++ b/libc/src/ctype/isupper.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of isupper------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isupper.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +// TODO: Currently restricted to default locale. +// These should be extended using locale information. +int LLVM_LIBC_ENTRYPOINT(isupper)(int c) { + const unsigned ch = c; + return (ch - 'A') < 26; +} + +} // namespace __llvm_libc diff --git a/libc/src/ctype/isupper.h b/libc/src/ctype/isupper.h new file mode 100644 index 0000000000000..7a1f2270943a9 --- /dev/null +++ b/libc/src/ctype/isupper.h @@ -0,0 +1,18 @@ +//===-- Implementation header for isupper -------------------------*-C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_CTYPE_ISUPPER_H +#define LLVM_LIBC_SRC_CTYPE_ISUPPER_H + +namespace __llvm_libc { + +int isupper(int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_CTYPE_ISUPPER_H diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 8bd7c1c045cf4..d0eab632e9d79 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -78,6 +78,22 @@ add_entrypoint_object( .memchr ) +add_entrypoint_object( + memrchr + SRCS + memrchr.cpp + HDRS + memrchr.h +) + +add_entrypoint_object( + strrchr + SRCS + strrchr.cpp + HDRS + strrchr.h +) + # Helper to define a function with multiple implementations # - Computes flags to satisfy required/rejected features and arch, # - Declares an entry point, diff --git a/libc/src/string/memrchr.cpp b/libc/src/string/memrchr.cpp new file mode 100644 index 0000000000000..81b034505202e --- /dev/null +++ b/libc/src/string/memrchr.cpp @@ -0,0 +1,26 @@ +//===-- Implementation of memrchr -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memrchr.h" +#include "src/__support/common.h" +#include <stddef.h> + +namespace __llvm_libc { + +void *LLVM_LIBC_ENTRYPOINT(memrchr)(const void *src, int c, size_t n) { + const unsigned char *str = reinterpret_cast<const unsigned char *>(src); + const unsigned char ch = c; + for (; n != 0; --n) { + const unsigned char *s = str + n - 1; + if (*s == ch) + return const_cast<unsigned char *>(s); + } + return nullptr; +} + +} // namespace __llvm_libc diff --git a/libc/src/string/memrchr.h b/libc/src/string/memrchr.h new file mode 100644 index 0000000000000..8f43577e331fe --- /dev/null +++ b/libc/src/string/memrchr.h @@ -0,0 +1,20 @@ +//===-- Implementation header for memrchr -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMRCHR_H +#define LLVM_LIBC_SRC_STRING_MEMRCHR_H + +#include <stddef.h> + +namespace __llvm_libc { + +void *memrchr(const void *src, int c, size_t n); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMRCHR_H diff --git a/libc/src/string/strrchr.cpp b/libc/src/string/strrchr.cpp new file mode 100644 index 0000000000000..374a802fbb9e2 --- /dev/null +++ b/libc/src/string/strrchr.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of strrchr------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strrchr.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +char *LLVM_LIBC_ENTRYPOINT(strrchr)(const char *src, int c) { + const char ch = c; + char *last_occurrence = nullptr; + do { + if (*src == ch) + last_occurrence = const_cast<char *>(src); + } while (*src++); + return last_occurrence; +} + +} // namespace __llvm_libc diff --git a/libc/src/string/strrchr.h b/libc/src/string/strrchr.h new file mode 100644 index 0000000000000..7b85929b290b7 --- /dev/null +++ b/libc/src/string/strrchr.h @@ -0,0 +1,18 @@ +//===-- Implementation header for strrchr -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_STRRCHR_H +#define LLVM_LIBC_SRC_STRING_STRRCHR_H + +namespace __llvm_libc { + +char *strrchr(const char *src, int c); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_STRRCHR_H diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index 500b294ad3866..edce4bbf78cc0 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -1,3 +1,9 @@ +add_header_library( + errno_setter_matcher + HDRS + ErrnoSetterMatcher.h +) + add_custom_target(check-libc) add_subdirectory(config) diff --git a/libc/utils/UnitTest/ErrnoSetterMatcher.h b/libc/test/ErrnoSetterMatcher.h similarity index 84% rename from libc/utils/UnitTest/ErrnoSetterMatcher.h rename to libc/test/ErrnoSetterMatcher.h index d676ee9cbc917..7f8311bfd5e6c 100644 --- a/libc/utils/UnitTest/ErrnoSetterMatcher.h +++ b/libc/test/ErrnoSetterMatcher.h @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H -#define LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H +#ifndef LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H +#define LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H -#include "Test.h" - -// Using LLVM libc headers in UnitTest is not ideal however we also want the -// test/ directory to have the same layout as libc/ so there is no clean place -// to put this file except for in utils/UnitTest/. #include "src/errno/llvmlibc_errno.h" +#include "utils/UnitTest/Test.h" namespace __llvm_libc { namespace testing { @@ -73,4 +69,4 @@ static internal::ErrnoSetterMatcher Fails(int ExpectedErrno, } // namespace testing } // namespace __llvm_libc -#endif // LLVM_LIBC_UTILS_UNITTEST_ERRNOSETTERMATCHER_H +#endif // LLVM_LIBC_TEST_ERRNOSETTERMATCHER_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 3e7bad72482be..e6390fc7a1d65 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(assert) +add_subdirectory(ctype) add_subdirectory(errno) add_subdirectory(math) add_subdirectory(signal) @@ -91,4 +92,4 @@ endif() target_link_libraries(libc-integration-test PRIVATE ${library_files} -) \ No newline at end of file +) diff --git a/libc/test/src/ctype/CMakeLists.txt b/libc/test/src/ctype/CMakeLists.txt new file mode 100644 index 0000000000000..c9959465c697c --- /dev/null +++ b/libc/test/src/ctype/CMakeLists.txt @@ -0,0 +1,51 @@ +add_libc_testsuite(libc_ctype_unittests) + +add_libc_unittest( + isalnum + SUITE + libc_ctype_unittests + SRCS + isalnum_test.cpp + DEPENDS + libc.src.ctype.isalnum +) + +add_libc_unittest( + isalpha + SUITE + libc_ctype_unittests + SRCS + isalpha_test.cpp + DEPENDS + libc.src.ctype.isalpha +) + +add_libc_unittest( + isdigit + SUITE + libc_ctype_unittests + SRCS + isdigit_test.cpp + DEPENDS + libc.src.ctype.isdigit +) + +add_libc_unittest( + islower + SUITE + libc_ctype_unittests + SRCS + islower_test.cpp + DEPENDS + libc.src.ctype.islower +) + +add_libc_unittest( + isupper + SUITE + libc_ctype_unittests + SRCS + isupper_test.cpp + DEPENDS + libc.src.ctype.isupper +) diff --git a/libc/test/src/ctype/isalnum_test.cpp b/libc/test/src/ctype/isalnum_test.cpp new file mode 100644 index 0000000000000..ca77285c5614e --- /dev/null +++ b/libc/test/src/ctype/isalnum_test.cpp @@ -0,0 +1,23 @@ +//===-- Unittests for isalnum----------------------------------------------===// +// +// 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalnum.h" + +#include "utils/UnitTest/Test.h" + +TEST(IsAlNum, DefaultLocale) { + // Loops through all characters, verifying that numbers and letters + // return non-zero integer and everything else returns a zero. + for (int c = 0; c < 255; ++c) { + if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9')) + EXPECT_NE(__llvm_libc::isalnum(c), 0); + else + EXPECT_EQ(__llvm_libc::isalnum(c), 0); + } +} diff --git a/libc/test/src/ctype/isalpha_test.cpp b/libc/test/src/ctype/isalpha_test.cpp new file mode 100644 index 0000000000000..d91219b504063 --- /dev/null +++ b/libc/test/src/ctype/isalpha_test.cpp @@ -0,0 +1,22 @@ +//===-- Unittests for isalpha----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isalpha.h" + +#include "utils/UnitTest/Test.h" + +TEST(IsAlpha, DefaultLocale) { + // Loops through all characters, verifying that letters return a + // non-zero integer and everything else returns zero. + for (int ch = 0; ch < 255; ++ch) { + if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) + EXPECT_NE(__llvm_libc::isalpha(ch), 0); + else + EXPECT_EQ(__llvm_libc::isalpha(ch), 0); + } +} diff --git a/libc/test/src/ctype/isdigit_test.cpp b/libc/test/src/ctype/isdigit_test.cpp new file mode 100644 index 0000000000000..2430a92425c4a --- /dev/null +++ b/libc/test/src/ctype/isdigit_test.cpp @@ -0,0 +1,22 @@ +//===-- Unittests for isdigit----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isdigit.h" + +#include "utils/UnitTest/Test.h" + +TEST(IsDigit, DefaultLocale) { + // Loops through all characters, verifying that numbers return a + // non-zero integer and everything else returns zero. + for (int ch = 0; ch < 255; ++ch) { + if ('0' <= ch && ch <= '9') + EXPECT_NE(__llvm_libc::isdigit(ch), 0); + else + EXPECT_EQ(__llvm_libc::isdigit(ch), 0); + } +} diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp new file mode 100644 index 0000000000000..9b38cabc67aa7 --- /dev/null +++ b/libc/test/src/ctype/islower_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for islower----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/islower.h" +#include "utils/UnitTest/Test.h" + +TEST(IsLower, DefaultLocale) { + // Loops through all characters, verifying that lowercase letters + // return a non-zero integer and everything else returns zero. + for (int ch = 0; ch < 255; ++ch) { + if ('a' <= ch && ch <= 'z') + EXPECT_NE(__llvm_libc::islower(ch), 0); + else + EXPECT_EQ(__llvm_libc::islower(ch), 0); + } +} diff --git a/libc/test/src/ctype/isupper_test.cpp b/libc/test/src/ctype/isupper_test.cpp new file mode 100644 index 0000000000000..0a13f4e11b0e1 --- /dev/null +++ b/libc/test/src/ctype/isupper_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for isupper----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/ctype/isupper.h" +#include "utils/UnitTest/Test.h" + +TEST(IsUpper, DefaultLocale) { + // Loops through all characters, verifying that uppercase letters + // return a non-zero integer and everything else returns zero. + for (int ch = 0; ch < 255; ++ch) { + if ('A' <= ch && ch <= 'Z') + EXPECT_NE(__llvm_libc::isupper(ch), 0); + else + EXPECT_EQ(__llvm_libc::isupper(ch), 0); + } +} diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index b5c1281b81023..c5baaf7d9d550 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -22,6 +22,7 @@ add_libc_unittest( libc.include.signal libc.src.signal.raise libc.src.signal.sigaction + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -37,6 +38,7 @@ add_libc_unittest( libc.src.signal.sigaddset libc.src.signal.sigemptyset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -49,6 +51,7 @@ add_libc_unittest( libc.include.errno libc.include.signal libc.src.signal.sigaddset + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -63,6 +66,7 @@ add_libc_unittest( libc.src.errno.__errno_location libc.src.signal.raise libc.src.signal.signal + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -77,6 +81,7 @@ add_libc_unittest( libc.src.signal.raise libc.src.signal.sigfillset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) add_libc_unittest( @@ -92,4 +97,5 @@ add_libc_unittest( libc.src.signal.sigdelset libc.src.signal.sigfillset libc.src.signal.sigprocmask + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/signal/sigaction_test.cpp b/libc/test/src/signal/sigaction_test.cpp index 726d76b3795de..a473b646072f6 100644 --- a/libc/test/src/signal/sigaction_test.cpp +++ b/libc/test/src/signal/sigaction_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/raise.h" #include "src/signal/sigaction.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/signal/sigaddset_test.cpp b/libc/test/src/signal/sigaddset_test.cpp index f106edb57f905..922110b397b6c 100644 --- a/libc/test/src/signal/sigaddset_test.cpp +++ b/libc/test/src/signal/sigaddset_test.cpp @@ -10,7 +10,7 @@ #include "include/signal.h" #include "src/signal/sigaddset.h" -#include 
"utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" // This tests invalid inputs and ensures errno is properly set. diff --git a/libc/test/src/signal/sigdelset_test.cpp b/libc/test/src/signal/sigdelset_test.cpp index 48e0e6f3f5c4a..42ba9335e219d 100644 --- a/libc/test/src/signal/sigdelset_test.cpp +++ b/libc/test/src/signal/sigdelset_test.cpp @@ -13,7 +13,7 @@ #include "src/signal/sigfillset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" TEST(Sigdelset, Invalid) { diff --git a/libc/test/src/signal/sigfillset_test.cpp b/libc/test/src/signal/sigfillset_test.cpp index c21bf137d2836..16f87e641fbe5 100644 --- a/libc/test/src/signal/sigfillset_test.cpp +++ b/libc/test/src/signal/sigfillset_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/sigfillset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" TEST(Sigfillset, Invalid) { diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 8db26e91d670b..03bf22a5f3eb9 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -12,7 +12,7 @@ #include "src/signal/raise.h" #include "src/signal/signal.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp index 824415b7267cd..90b57e9e0f2f0 100644 --- a/libc/test/src/signal/sigprocmask_test.cpp +++ b/libc/test/src/signal/sigprocmask_test.cpp @@ -14,7 +14,7 @@ #include "src/signal/sigemptyset.h" #include "src/signal/sigprocmask.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" class SignalTest : public __llvm_libc::testing::Test { diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index be43cc912b5a3..0fff250fabd15 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -82,6 +82,26 @@ add_libc_unittest( libc.src.string.strnlen ) +add_libc_unittest( + memrchr_test + SUITE + libc_string_unittests + SRCS + memrchr_test.cpp + DEPENDS + libc.src.string.memrchr +) + +add_libc_unittest( + strrchr_test + SUITE + libc_string_unittests + SRCS + strrchr_test.cpp + DEPENDS + libc.src.string.strrchr +) + # Tests all implementations that can run on the host. function(add_libc_multi_impl_test name) get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) diff --git a/libc/test/src/string/memrchr_test.cpp b/libc/test/src/string/memrchr_test.cpp new file mode 100644 index 0000000000000..5f5f7a0d01828 --- /dev/null +++ b/libc/test/src/string/memrchr_test.cpp @@ -0,0 +1,114 @@ +//===-- Unittests for memrchr ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memrchr.h" +#include "utils/UnitTest/Test.h" +#include <stddef.h> + +// A helper function that calls memrchr and abstracts away the explicit cast +// for readability purposes. +const char *call_memrchr(const void *src, int c, size_t size) { + return reinterpret_cast<const char *>(__llvm_libc::memrchr(src, c, size)); +} + +TEST(MemRChrTest, FindsCharacterAfterNullTerminator) { + // memrchr should continue searching after a null terminator. + const size_t size = 6; + const unsigned char src[size] = {'a', '\0', 'b', 'c', 'd', '\0'}; + // Should return 'b', 'c', 'd', '\0' even though they sit after a null terminator. + ASSERT_STREQ(call_memrchr(src, 'b', size), "bcd"); +} + +TEST(MemRChrTest, FindsCharacterInNonNullTerminatedCollection) { + const size_t size = 3; + const unsigned char src[size] = {'a', 'b', 'c'}; + // Should return 'b', 'c'. + const char *ret = call_memrchr(src, 'b', size); + ASSERT_EQ(ret[0], 'b'); + ASSERT_EQ(ret[1], 'c'); +} + +TEST(MemRChrTest, FindsFirstCharacter) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return the original array since 'a' is the first character. + ASSERT_STREQ(call_memrchr(src, 'a', size), "abcde"); +} + +TEST(MemRChrTest, FindsMiddleCharacter) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return characters after (and including) 'c'. + ASSERT_STREQ(call_memrchr(src, 'c', size), "cde"); +} + +TEST(MemRChrTest, FindsLastCharacterThatIsNotNullTerminator) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return 'e' and the null terminator. + ASSERT_STREQ(call_memrchr(src, 'e', size), "e"); +} + +TEST(MemRChrTest, FindsNullTerminator) { + const size_t size = 6; + const unsigned char src[size] = {'a', 'b', 'c', 'd', 'e', '\0'}; + // Should return the null terminator. + ASSERT_STREQ(call_memrchr(src, '\0', size), ""); +} + +TEST(MemRChrTest, CharacterNotWithinStringShouldReturnNullptr) { + const size_t size = 4; + const unsigned char src[size] = {'1', '2', '3', '?'}; + // Since 'z' is not within the string, this should return nullptr. + ASSERT_STREQ(call_memrchr(src, 'z', size), nullptr); +} + +TEST(MemRChrTest, CharacterNotWithinSizeShouldReturnNullptr) { + const unsigned char src[5] = {'1', '2', '3', '4', '\0'}; + // Since '4' is not within the first 2 characters, this should return nullptr. + const size_t size = 2; + ASSERT_STREQ(call_memrchr(src, '4', size), nullptr); +} + +TEST(MemRChrTest, ShouldFindLastOfDuplicates) { + size_t size = 12; // 11 characters + null terminator. + const char *dups = "abc1def1ghi"; + // '1' is duplicated in 'dups', but it should find the last copy. + ASSERT_STREQ(call_memrchr(dups, '1', size), "1ghi"); + + const char *repeated = "XXXXX"; + size = 6; // 5 characters + null terminator. + // Should return the last X with the null terminator. + ASSERT_STREQ(call_memrchr(repeated, 'X', size), "X"); +} + +TEST(MemRChrTest, EmptyStringShouldOnlyMatchNullTerminator) { + const size_t size = 1; // Null terminator. + const char *empty_string = ""; + // The null terminator should match. + ASSERT_STREQ(call_memrchr(empty_string, '\0', size), ""); + // All other characters should not match.
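+ // (Editorial note: memrchr is a GNU extension, but like memchr in C11
+ // 7.24.5.1 it compares bytes as unsigned char; that convention is what the
+ // SignedCharacterFound test further down relies on when it passes c = -1.)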
+ ASSERT_STREQ(call_memrchr(empty_string, 'A', size), nullptr); + ASSERT_STREQ(call_memrchr(empty_string, '9', size), nullptr); + ASSERT_STREQ(call_memrchr(empty_string, '?', size), nullptr); +} + +TEST(MemRChrTest, SignedCharacterFound) { + char c = -1; + const size_t size = 1; + char src[size] = {c}; + const char *actual = call_memrchr(src, c, size); + // Should find the last character 'c'. + ASSERT_EQ(actual[0], c); +} + +TEST(MemRChrTest, ZeroLengthShouldReturnNullptr) { + const unsigned char src[4] = {'a', 'b', 'c', '\0'}; + // This will iterate over exactly zero characters, so should return nullptr. + ASSERT_STREQ(call_memrchr(src, 'd', 0), nullptr); +} diff --git a/libc/test/src/string/strchr_test.cpp b/libc/test/src/string/strchr_test.cpp index 37b3733857997..dda930c0c1552 100644 --- a/libc/test/src/string/strchr_test.cpp +++ b/libc/test/src/string/strchr_test.cpp @@ -11,42 +11,38 @@ TEST(StrChrTest, FindsFirstCharacter) { const char *src = "abcde"; - const char *src_copy = src; // Should return original string since 'a' is the first character. ASSERT_STREQ(__llvm_libc::strchr(src, 'a'), "abcde"); // Source string should not change. - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); } TEST(StrChrTest, FindsMiddleCharacter) { const char *src = "abcde"; - const char *src_copy = src; // Should return characters after (and including) 'c'. ASSERT_STREQ(__llvm_libc::strchr(src, 'c'), "cde"); // Source string should not change. - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); } TEST(StrChrTest, FindsLastCharacterThatIsNotNullTerminator) { const char *src = "abcde"; - const char *src_copy = src; // Should return 'e' and null-terminator. ASSERT_STREQ(__llvm_libc::strchr(src, 'e'), "e"); // Source string should not change. - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); } TEST(StrChrTest, FindsNullTerminator) { const char *src = "abcde"; - const char *src_copy = src; // Should return null terminator. ASSERT_STREQ(__llvm_libc::strchr(src, '\0'), ""); // Source string should not change. - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); } TEST(StrChrTest, CharacterNotWithinStringShouldReturnNullptr) { @@ -56,16 +52,15 @@ TEST(StrChrTest, CharacterNotWithinStringShouldReturnNullptr) { TEST(StrChrTest, TheSourceShouldNotChange) { const char *src = "abcde"; - const char *src_copy = src; // When the character is found, the source string should not change. __llvm_libc::strchr(src, 'd'); - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); // Same case for when the character is not found. __llvm_libc::strchr(src, 'z'); - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); // Same case for when looking for nullptr. __llvm_libc::strchr(src, '\0'); - ASSERT_STREQ(src, src_copy); + ASSERT_STREQ(src, "abcde"); } TEST(StrChrTest, ShouldFindFirstOfDuplicates) { diff --git a/libc/test/src/string/strrchr_test.cpp b/libc/test/src/string/strrchr_test.cpp new file mode 100644 index 0000000000000..5ed83aa64bfbc --- /dev/null +++ b/libc/test/src/string/strrchr_test.cpp @@ -0,0 +1,80 @@ +//===-- Unittests for strrchr ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/strrchr.h" +#include "utils/UnitTest/Test.h" + +TEST(StrRChrTest, FindsFirstCharacter) { + const char *src = "abcde"; + + // Should return original string since 'a' is the first character. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'a'), "abcde"); + // Source string should not change. + ASSERT_STREQ(src, "abcde"); +} + +TEST(StrRChrTest, FindsMiddleCharacter) { + const char *src = "abcde"; + + // Should return characters after (and including) 'c'. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'c'), "cde"); + // Source string should not change. + ASSERT_STREQ(src, "abcde"); +} + +TEST(StrRChrTest, FindsLastCharacterThatIsNotNullTerminator) { + const char *src = "abcde"; + + // Should return 'e' and null-terminator. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'e'), "e"); + // Source string should not change. + ASSERT_STREQ(src, "abcde"); +} + +TEST(StrRChrTest, FindsNullTerminator) { + const char *src = "abcde"; + + // Should return null terminator. + ASSERT_STREQ(__llvm_libc::strrchr(src, '\0'), ""); + // Source string should not change. + ASSERT_STREQ(src, "abcde"); +} + +TEST(StrRChrTest, FindsLastBehindFirstNullTerminator) { + const char src[6] = {'a', 'a', '\0', 'b', '\0', 'c'}; + // 'b' is behind a null terminator, so should not be found. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'b'), nullptr); + // Same goes for 'c'. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'c'), nullptr); + + // Should find the second of the two a's. + ASSERT_STREQ(__llvm_libc::strrchr(src, 'a'), "a"); +} + +TEST(StrRChrTest, CharacterNotWithinStringShouldReturnNullptr) { + // Since 'z' is not within the string, should return nullptr. + ASSERT_STREQ(__llvm_libc::strrchr("123?", 'z'), nullptr); +} + +TEST(StrRChrTest, ShouldFindLastOfDuplicates) { + // '1' is duplicated in the string, but it should find the last copy. + ASSERT_STREQ(__llvm_libc::strrchr("abc1def1ghi", '1'), "1ghi"); + + const char *dups = "XXXXX"; + // Should return the last occurrence of 'X'. + ASSERT_STREQ(__llvm_libc::strrchr(dups, 'X'), "X"); +} + +TEST(StrRChrTest, EmptyStringShouldOnlyMatchNullTerminator) { + // Null terminator should match. + ASSERT_STREQ(__llvm_libc::strrchr("", '\0'), ""); + // All other characters should not match. 
+ ASSERT_STREQ(__llvm_libc::strrchr("", 'A'), nullptr); + ASSERT_STREQ(__llvm_libc::strrchr("", '2'), nullptr); + ASSERT_STREQ(__llvm_libc::strrchr("", '*'), nullptr); +} diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt index 3c53e4b60107e..c7d3dde7d9751 100644 --- a/libc/test/src/sys/mman/linux/CMakeLists.txt +++ b/libc/test/src/sys/mman/linux/CMakeLists.txt @@ -12,4 +12,5 @@ add_libc_unittest( libc.src.errno.__errno_location libc.src.sys.mman.mmap libc.src.sys.mman.munmap + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/sys/mman/linux/mmap_test.cpp b/libc/test/src/sys/mman/linux/mmap_test.cpp index 38478c77b18bd..8f91c2b548e01 100644 --- a/libc/test/src/sys/mman/linux/mmap_test.cpp +++ b/libc/test/src/sys/mman/linux/mmap_test.cpp @@ -11,7 +11,7 @@ #include "src/errno/llvmlibc_errno.h" #include "src/sys/mman/mmap.h" #include "src/sys/mman/munmap.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" using __llvm_libc::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 5eae9c8b478c0..54e513504b3bf 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -7,7 +7,8 @@ add_libc_unittest( SRCS write_test.cpp DEPENDS - libc.src.unistd.write libc.include.errno libc.include.unistd + libc.src.unistd.write + libc.test.errno_setter_matcher ) diff --git a/libc/test/src/unistd/write_test.cpp b/libc/test/src/unistd/write_test.cpp index 2a91ef6fc277f..a2c4cfca05c75 100644 --- a/libc/test/src/unistd/write_test.cpp +++ b/libc/test/src/unistd/write_test.cpp @@ -8,7 +8,7 @@ #include "include/errno.h" #include "src/unistd/write.h" -#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "test/ErrnoSetterMatcher.h" #include "utils/UnitTest/Test.h" #include "utils/testutils/FDReader.h" diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt index c6e877fceef7c..a55182065d20f 100644 --- a/libc/utils/CMakeLists.txt +++ b/libc/utils/CMakeLists.txt @@ -1,6 +1,8 @@ add_subdirectory(CPP) add_subdirectory(FPUtil) +add_subdirectory(LibcTableGenUtil) add_subdirectory(HdrGen) add_subdirectory(MPFRWrapper) add_subdirectory(testutils) +add_subdirectory(tools) add_subdirectory(UnitTest) diff --git a/libc/utils/FPUtil/LongDoubleBitsX86.h b/libc/utils/FPUtil/LongDoubleBitsX86.h index 5438e0b2b6edb..4d64490c5a11d 100644 --- a/libc/utils/FPUtil/LongDoubleBitsX86.h +++ b/libc/utils/FPUtil/LongDoubleBitsX86.h @@ -23,10 +23,10 @@ template <> struct MantissaWidth<long double> { template <unsigned Width> struct Padding; // i386 padding. -template <> struct Padding<4> { static constexpr unsigned Value = 16; }; +template <> struct Padding<4> { static constexpr unsigned value = 16; }; // x86_64 padding. -template <> struct Padding<8> { static constexpr unsigned Value = 48; }; +template <> struct Padding<8> { static constexpr unsigned value = 48; }; template <> struct __attribute__((packed)) FPBits<long double> { using UIntType = __uint128_t; @@ -38,7 +38,7 @@ template <> struct __attribute__((packed)) FPBits<long double> { uint8_t implicitBit : 1; uint16_t exponent : ExponentWidth<long double>::value; uint8_t sign : 1; - uint64_t padding : Padding<sizeof(uintptr_t)>::Value; + uint64_t padding : Padding<sizeof(uintptr_t)>::value; template <typename XType, cpp::EnableIfType<cpp::IsSame<long double, XType>::Value, int> = 0> @@ -91,7 +91,15 @@ template <> struct __attribute__((packed)) FPBits<long double> { // zero in case i386.
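+ // For instance, with the Padding specializations above: + // sizeof(long double) * 8 - Padding<sizeof(uintptr_t)>::value is + // 96 - 16 == 80 on i386 and 128 - 48 == 80 on x86_64, so the mask computed + // below keeps exactly the 80 significant bits of the long double.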
UIntType result = UIntType(0); *reinterpret_cast<FPBits<long double> *>(&result) = *this; - return result; + + // Even though we zero out |result| before copying the long double value, + // there can be garbage bits in the padding. So, we zero the padding bits + // in |result|. + static constexpr UIntType mask = + (UIntType(1) << (sizeof(long double) * 8 - + Padding<sizeof(uintptr_t)>::value)) - + 1; + return result & mask; } static FPBits<long double> zero() { return FPBits<long double>(0.0l); } diff --git a/libc/utils/HdrGen/CMakeLists.txt b/libc/utils/HdrGen/CMakeLists.txt index 65c2d65c79181..8aaaa965acbfe 100644 --- a/libc/utils/HdrGen/CMakeLists.txt +++ b/libc/utils/HdrGen/CMakeLists.txt @@ -12,4 +12,7 @@ add_tablegen(libc-hdrgen llvm-libc PublicAPICommand.h ) +target_include_directories(libc-hdrgen PRIVATE ${LIBC_SOURCE_DIR}) +target_link_libraries(libc-hdrgen PRIVATE LibcTableGenUtil) + add_subdirectory(PrototypeTestGen) diff --git a/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt b/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt index e4ad5c209cd9e..c90fde76dd58c 100644 --- a/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt +++ b/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt @@ -1,5 +1,5 @@ add_tablegen(libc-prototype-testgen llvm-libc PrototypeTestGen.cpp - ../PublicAPICommand.cpp - ../Command.cpp ) +target_link_libraries(libc-prototype-testgen PRIVATE LibcTableGenUtil) +target_include_directories(libc-prototype-testgen PRIVATE ${LIBC_SOURCE_DIR}) diff --git a/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp b/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp index aad451a8de225..918b520d3910a 100644 --- a/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp +++ b/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "../PublicAPICommand.h" +#include "utils/LibcTableGenUtil/APIIndexer.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" diff --git a/libc/utils/HdrGen/PublicAPICommand.cpp b/libc/utils/HdrGen/PublicAPICommand.cpp index 11b825831e883..87b188ca8c02b 100644 --- a/libc/utils/HdrGen/PublicAPICommand.cpp +++ b/libc/utils/HdrGen/PublicAPICommand.cpp @@ -8,32 +8,13 @@ #include "PublicAPICommand.h" +#include "utils/LibcTableGenUtil/APIIndexer.h" + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -static const char NamedTypeClassName[] = "NamedType"; -static const char PtrTypeClassName[] = "PtrType"; -static const char RestrictedPtrTypeClassName[] = "RestrictedPtrType"; -static const char ConstTypeClassName[] = "ConstType"; -static const char StructTypeClassName[] = "Struct"; - -static const char StandardSpecClassName[] = "StandardSpec"; -static const char PublicAPIClassName[] = "PublicAPI"; - -static bool isa(llvm::Record *Def, llvm::Record *TypeClass) { - llvm::RecordRecTy *RecordType = Def->getType(); - llvm::ArrayRef<llvm::Record *> Classes = RecordType->getClasses(); - // We want exact types. That is, we don't want the classes listed in - // spec.td to be subclassed. Hence, we do not want the record |Def| - // to be of more than one class type.. - if (Classes.size() != 1) - return false; - return Classes[0] == TypeClass; -} - // Text blocks for macro definitions and type decls can be indented to // suit the surrounding tablegen listing. We need to dedent such blocks // before writing them out.
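The dedent step that comment describes can be pictured with a short sketch. This is illustrative only, using plain standard-library types rather than the llvm::StringRef/raw_ostream signature of the dedentAndWrite helper named in the hunk header below:

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

// Strip the common leading indentation shared by all non-blank lines.
static std::string dedent(const std::string &text) {
  std::vector<std::string> lines;
  std::istringstream stream(text);
  for (std::string line; std::getline(stream, line);)
    lines.push_back(line);
  // The dedent amount is the smallest indentation over the non-blank lines.
  size_t indent = std::string::npos;
  for (const std::string &line : lines) {
    size_t firstNonBlank = line.find_first_not_of(" \t");
    if (firstNonBlank != std::string::npos)
      indent = std::min(indent, firstNonBlank);
  }
  if (indent == std::string::npos) // Every line was blank.
    indent = 0;
  std::string result;
  for (const std::string &line : lines)
    result += (line.size() > indent ? line.substr(indent) : std::string()) + "\n";
  return result;
}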
@@ -59,129 +40,6 @@ static void dedentAndWrite(llvm::StringRef Text, llvm::raw_ostream &OS) { namespace llvm_libc { -bool APIIndexer::isaNamedType(llvm::Record *Def) { - return isa(Def, NamedTypeClass); -} - -bool APIIndexer::isaStructType(llvm::Record *Def) { - return isa(Def, StructClass); -} - -bool APIIndexer::isaPtrType(llvm::Record *Def) { - return isa(Def, PtrTypeClass); -} - -bool APIIndexer::isaConstType(llvm::Record *Def) { - return isa(Def, ConstTypeClass); -} - -bool APIIndexer::isaRestrictedPtrType(llvm::Record *Def) { - return isa(Def, RestrictedPtrTypeClass); -} - -bool APIIndexer::isaStandardSpec(llvm::Record *Def) { - return isa(Def, StandardSpecClass); -} - -bool APIIndexer::isaPublicAPI(llvm::Record *Def) { - return isa(Def, PublicAPIClass); -} - -std::string APIIndexer::getTypeAsString(llvm::Record *TypeRecord) { - if (isaNamedType(TypeRecord) || isaStructType(TypeRecord)) { - return std::string(TypeRecord->getValueAsString("Name")); - } else if (isaPtrType(TypeRecord)) { - return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + " *"; - } else if (isaConstType(TypeRecord)) { - return std::string("const ") + - getTypeAsString(TypeRecord->getValueAsDef("UnqualifiedType")); - } else if (isaRestrictedPtrType(TypeRecord)) { - return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + - " *__restrict"; - } else { - llvm::PrintFatalError(TypeRecord->getLoc(), "Invalid type.\n"); - } -} - -void APIIndexer::indexStandardSpecDef(llvm::Record *StandardSpec) { - auto HeaderSpecList = StandardSpec->getValueAsListOfDefs("Headers"); - for (llvm::Record *HeaderSpec : HeaderSpecList) { - llvm::StringRef Header = HeaderSpec->getValueAsString("Name"); - if (!StdHeader.hasValue() || Header == StdHeader) { - PublicHeaders.emplace(Header); - auto MacroSpecList = HeaderSpec->getValueAsListOfDefs("Macros"); - // TODO: Trigger a fatal error on duplicate specs. - for (llvm::Record *MacroSpec : MacroSpecList) - MacroSpecMap[std::string(MacroSpec->getValueAsString("Name"))] = - MacroSpec; - - auto TypeSpecList = HeaderSpec->getValueAsListOfDefs("Types"); - for (llvm::Record *TypeSpec : TypeSpecList) - TypeSpecMap[std::string(TypeSpec->getValueAsString("Name"))] = TypeSpec; - - auto FunctionSpecList = HeaderSpec->getValueAsListOfDefs("Functions"); - for (llvm::Record *FunctionSpec : FunctionSpecList) { - FunctionSpecMap[std::string(FunctionSpec->getValueAsString("Name"))] = - FunctionSpec; - } - - auto EnumerationSpecList = - HeaderSpec->getValueAsListOfDefs("Enumerations"); - for (llvm::Record *EnumerationSpec : EnumerationSpecList) { - EnumerationSpecMap[std::string( - EnumerationSpec->getValueAsString("Name"))] = EnumerationSpec; - } - } - } -} - -void APIIndexer::indexPublicAPIDef(llvm::Record *PublicAPI) { - // While indexing the public API, we do not check if any of the entities - // requested is from an included standard. Such a check is done while - // generating the API. 
- auto MacroDefList = PublicAPI->getValueAsListOfDefs("Macros"); - for (llvm::Record *MacroDef : MacroDefList) - MacroDefsMap[std::string(MacroDef->getValueAsString("Name"))] = MacroDef; - - auto TypeDeclList = PublicAPI->getValueAsListOfDefs("TypeDeclarations"); - for (llvm::Record *TypeDecl : TypeDeclList) - TypeDeclsMap[std::string(TypeDecl->getValueAsString("Name"))] = TypeDecl; - - auto StructList = PublicAPI->getValueAsListOfStrings("Structs"); - for (llvm::StringRef StructName : StructList) - Structs.insert(std::string(StructName)); - - auto FunctionList = PublicAPI->getValueAsListOfStrings("Functions"); - for (llvm::StringRef FunctionName : FunctionList) - Functions.insert(std::string(FunctionName)); - - auto EnumerationList = PublicAPI->getValueAsListOfStrings("Enumerations"); - for (llvm::StringRef EnumerationName : EnumerationList) - Enumerations.insert(std::string(EnumerationName)); -} - -void APIIndexer::index(llvm::RecordKeeper &Records) { - NamedTypeClass = Records.getClass(NamedTypeClassName); - PtrTypeClass = Records.getClass(PtrTypeClassName); - RestrictedPtrTypeClass = Records.getClass(RestrictedPtrTypeClassName); - StructClass = Records.getClass(StructTypeClassName); - ConstTypeClass = Records.getClass(ConstTypeClassName); - StandardSpecClass = Records.getClass(StandardSpecClassName); - PublicAPIClass = Records.getClass(PublicAPIClassName); - - const auto &DefsMap = Records.getDefs(); - for (auto &Pair : DefsMap) { - llvm::Record *Def = Pair.second.get(); - if (isaStandardSpec(Def)) - indexStandardSpecDef(Def); - if (isaPublicAPI(Def)) { - if (!StdHeader.hasValue() || - Def->getValueAsString("HeaderName") == StdHeader) - indexPublicAPIDef(Def); - } - } -} - void writeAPIFromIndex(APIIndexer &G, llvm::raw_ostream &OS) { for (auto &Pair : G.MacroDefsMap) { const std::string &Name = Pair.first; diff --git a/libc/utils/HdrGen/PublicAPICommand.h b/libc/utils/HdrGen/PublicAPICommand.h index c95ad525647a7..bfe2e5f5d0075 100644 --- a/libc/utils/HdrGen/PublicAPICommand.h +++ b/libc/utils/HdrGen/PublicAPICommand.h @@ -12,10 +12,8 @@ #include "Command.h" #include "llvm/ADT/StringRef.h" - -#include -#include -#include +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" namespace llvm { @@ -36,61 +34,6 @@ class PublicAPICommand : public Command { const Command::ErrorReporter &Reporter) const override; }; -class APIIndexer { -private: - llvm::Optional StdHeader; - - // TableGen classes in spec.td. - llvm::Record *NamedTypeClass; - llvm::Record *PtrTypeClass; - llvm::Record *RestrictedPtrTypeClass; - llvm::Record *ConstTypeClass; - llvm::Record *StructClass; - llvm::Record *StandardSpecClass; - llvm::Record *PublicAPIClass; - - bool isaNamedType(llvm::Record *Def); - bool isaStructType(llvm::Record *Def); - bool isaPtrType(llvm::Record *Def); - bool isaConstType(llvm::Record *Def); - bool isaRestrictedPtrType(llvm::Record *Def); - bool isaStandardSpec(llvm::Record *Def); - bool isaPublicAPI(llvm::Record *Def); - - void indexStandardSpecDef(llvm::Record *StandardSpec); - void indexPublicAPIDef(llvm::Record *PublicAPI); - void index(llvm::RecordKeeper &Records); - -public: - using NameToRecordMapping = std::unordered_map; - using NameSet = std::unordered_set; - - // This indexes all headers, not just a specified one. 
- explicit APIIndexer(llvm::RecordKeeper &Records) : StdHeader(llvm::None) { - index(Records); - } - - APIIndexer(llvm::StringRef Header, llvm::RecordKeeper &Records) - : StdHeader(Header) { - index(Records); - } - - // Mapping from names to records defining them. - NameToRecordMapping MacroSpecMap; - NameToRecordMapping TypeSpecMap; - NameToRecordMapping EnumerationSpecMap; - NameToRecordMapping FunctionSpecMap; - NameToRecordMapping MacroDefsMap; - NameToRecordMapping TypeDeclsMap; - - NameSet Structs; - NameSet Enumerations; - NameSet Functions; - NameSet PublicHeaders; - - std::string getTypeAsString(llvm::Record *TypeRecord); -}; - } // namespace llvm_libc #endif // LLVM_LIBC_UTILS_HDRGEN_PUBLICAPICOMMAND_H diff --git a/libc/utils/LibcTableGenUtil/APIIndexer.cpp b/libc/utils/LibcTableGenUtil/APIIndexer.cpp new file mode 100644 index 0000000000000..16aef5880bffe --- /dev/null +++ b/libc/utils/LibcTableGenUtil/APIIndexer.cpp @@ -0,0 +1,162 @@ +//===-- Implementation of APIIndexer class --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "APIIndexer.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" + +namespace llvm_libc { + +static const char NamedTypeClassName[] = "NamedType"; +static const char PtrTypeClassName[] = "PtrType"; +static const char RestrictedPtrTypeClassName[] = "RestrictedPtrType"; +static const char ConstTypeClassName[] = "ConstType"; +static const char StructTypeClassName[] = "Struct"; + +static const char StandardSpecClassName[] = "StandardSpec"; +static const char PublicAPIClassName[] = "PublicAPI"; + +static bool isa(llvm::Record *Def, llvm::Record *TypeClass) { + llvm::RecordRecTy *RecordType = Def->getType(); + llvm::ArrayRef Classes = RecordType->getClasses(); + // We want exact types. That is, we don't want the classes listed in + // spec.td to be subclassed. Hence, we do not want the record |Def| + // to be of more than one class type.. 
+ if (Classes.size() != 1) + return false; + return Classes[0] == TypeClass; +} + +bool APIIndexer::isaNamedType(llvm::Record *Def) { + return isa(Def, NamedTypeClass); +} + +bool APIIndexer::isaStructType(llvm::Record *Def) { + return isa(Def, StructClass); +} + +bool APIIndexer::isaPtrType(llvm::Record *Def) { + return isa(Def, PtrTypeClass); +} + +bool APIIndexer::isaConstType(llvm::Record *Def) { + return isa(Def, ConstTypeClass); +} + +bool APIIndexer::isaRestrictedPtrType(llvm::Record *Def) { + return isa(Def, RestrictedPtrTypeClass); +} + +bool APIIndexer::isaStandardSpec(llvm::Record *Def) { + return isa(Def, StandardSpecClass); +} + +bool APIIndexer::isaPublicAPI(llvm::Record *Def) { + return isa(Def, PublicAPIClass); +} + +std::string APIIndexer::getTypeAsString(llvm::Record *TypeRecord) { + if (isaNamedType(TypeRecord) || isaStructType(TypeRecord)) { + return std::string(TypeRecord->getValueAsString("Name")); + } else if (isaPtrType(TypeRecord)) { + return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + " *"; + } else if (isaConstType(TypeRecord)) { + return std::string("const ") + + getTypeAsString(TypeRecord->getValueAsDef("UnqualifiedType")); + } else if (isaRestrictedPtrType(TypeRecord)) { + return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + + " *__restrict"; + } else { + llvm::PrintFatalError(TypeRecord->getLoc(), "Invalid type.\n"); + } +} + +void APIIndexer::indexStandardSpecDef(llvm::Record *StandardSpec) { + auto HeaderSpecList = StandardSpec->getValueAsListOfDefs("Headers"); + for (llvm::Record *HeaderSpec : HeaderSpecList) { + llvm::StringRef Header = HeaderSpec->getValueAsString("Name"); + if (!StdHeader.hasValue() || Header == StdHeader) { + PublicHeaders.emplace(Header); + auto MacroSpecList = HeaderSpec->getValueAsListOfDefs("Macros"); + // TODO: Trigger a fatal error on duplicate specs. + for (llvm::Record *MacroSpec : MacroSpecList) + MacroSpecMap[std::string(MacroSpec->getValueAsString("Name"))] = + MacroSpec; + + auto TypeSpecList = HeaderSpec->getValueAsListOfDefs("Types"); + for (llvm::Record *TypeSpec : TypeSpecList) + TypeSpecMap[std::string(TypeSpec->getValueAsString("Name"))] = TypeSpec; + + auto FunctionSpecList = HeaderSpec->getValueAsListOfDefs("Functions"); + for (llvm::Record *FunctionSpec : FunctionSpecList) { + auto FunctionName = std::string(FunctionSpec->getValueAsString("Name")); + FunctionSpecMap[FunctionName] = FunctionSpec; + FunctionToHeaderMap[FunctionName] = std::string(Header); + } + + auto EnumerationSpecList = + HeaderSpec->getValueAsListOfDefs("Enumerations"); + for (llvm::Record *EnumerationSpec : EnumerationSpecList) { + EnumerationSpecMap[std::string( + EnumerationSpec->getValueAsString("Name"))] = EnumerationSpec; + } + } + } +} + +void APIIndexer::indexPublicAPIDef(llvm::Record *PublicAPI) { + // While indexing the public API, we do not check if any of the entities + // requested is from an included standard. Such a check is done while + // generating the API. 
+ auto MacroDefList = PublicAPI->getValueAsListOfDefs("Macros"); + for (llvm::Record *MacroDef : MacroDefList) + MacroDefsMap[std::string(MacroDef->getValueAsString("Name"))] = MacroDef; + + auto TypeDeclList = PublicAPI->getValueAsListOfDefs("TypeDeclarations"); + for (llvm::Record *TypeDecl : TypeDeclList) + TypeDeclsMap[std::string(TypeDecl->getValueAsString("Name"))] = TypeDecl; + + auto StructList = PublicAPI->getValueAsListOfStrings("Structs"); + for (llvm::StringRef StructName : StructList) + Structs.insert(std::string(StructName)); + + auto FunctionList = PublicAPI->getValueAsListOfStrings("Functions"); + for (llvm::StringRef FunctionName : FunctionList) + Functions.insert(std::string(FunctionName)); + + auto EnumerationList = PublicAPI->getValueAsListOfStrings("Enumerations"); + for (llvm::StringRef EnumerationName : EnumerationList) + Enumerations.insert(std::string(EnumerationName)); +} + +void APIIndexer::index(llvm::RecordKeeper &Records) { + NamedTypeClass = Records.getClass(NamedTypeClassName); + PtrTypeClass = Records.getClass(PtrTypeClassName); + RestrictedPtrTypeClass = Records.getClass(RestrictedPtrTypeClassName); + StructClass = Records.getClass(StructTypeClassName); + ConstTypeClass = Records.getClass(ConstTypeClassName); + StandardSpecClass = Records.getClass(StandardSpecClassName); + PublicAPIClass = Records.getClass(PublicAPIClassName); + + const auto &DefsMap = Records.getDefs(); + for (auto &Pair : DefsMap) { + llvm::Record *Def = Pair.second.get(); + if (isaStandardSpec(Def)) + indexStandardSpecDef(Def); + if (isaPublicAPI(Def)) { + if (!StdHeader.hasValue() || + Def->getValueAsString("HeaderName") == StdHeader) + indexPublicAPIDef(Def); + } + } +} + +} // namespace llvm_libc diff --git a/libc/utils/LibcTableGenUtil/APIIndexer.h b/libc/utils/LibcTableGenUtil/APIIndexer.h new file mode 100644 index 0000000000000..7b4d62a38c615 --- /dev/null +++ b/libc/utils/LibcTableGenUtil/APIIndexer.h @@ -0,0 +1,80 @@ +//===-- A class to index libc API listed in tablegen files ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H +#define LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/TableGen/Record.h" + +#include <string> +#include <unordered_map> +#include <unordered_set> + +namespace llvm_libc { + +class APIIndexer { +private: + llvm::Optional<llvm::StringRef> StdHeader; + + // TableGen classes in spec.td. + llvm::Record *NamedTypeClass; + llvm::Record *PtrTypeClass; + llvm::Record *RestrictedPtrTypeClass; + llvm::Record *ConstTypeClass; + llvm::Record *StructClass; + llvm::Record *StandardSpecClass; + llvm::Record *PublicAPIClass; + + bool isaNamedType(llvm::Record *Def); + bool isaStructType(llvm::Record *Def); + bool isaPtrType(llvm::Record *Def); + bool isaConstType(llvm::Record *Def); + bool isaRestrictedPtrType(llvm::Record *Def); + bool isaStandardSpec(llvm::Record *Def); + bool isaPublicAPI(llvm::Record *Def); + + void indexStandardSpecDef(llvm::Record *StandardSpec); + void indexPublicAPIDef(llvm::Record *PublicAPI); + void index(llvm::RecordKeeper &Records); + +public: + using NameToRecordMapping = std::unordered_map<std::string, llvm::Record *>; + using NameSet = std::unordered_set<std::string>; + + // This indexes all headers, not just a specified one.
+ explicit APIIndexer(llvm::RecordKeeper &Records) : StdHeader(llvm::None) { + index(Records); + } + + APIIndexer(llvm::StringRef Header, llvm::RecordKeeper &Records) + : StdHeader(Header) { + index(Records); + } + + // Mapping from names to records defining them. + NameToRecordMapping MacroSpecMap; + NameToRecordMapping TypeSpecMap; + NameToRecordMapping EnumerationSpecMap; + NameToRecordMapping FunctionSpecMap; + NameToRecordMapping MacroDefsMap; + NameToRecordMapping TypeDeclsMap; + + std::unordered_map<std::string, std::string> FunctionToHeaderMap; + + NameSet Structs; + NameSet Enumerations; + NameSet Functions; + NameSet PublicHeaders; + + std::string getTypeAsString(llvm::Record *TypeRecord); +}; + +} // namespace llvm_libc + +#endif // LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt new file mode 100644 index 0000000000000..ae887a8bdb03a --- /dev/null +++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt @@ -0,0 +1,7 @@ +add_llvm_library( + LibcTableGenUtil + APIIndexer.cpp + APIIndexer.h + LINK_COMPONENTS Support +) +target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) diff --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt index 0837f3d8895fc..9c7b48aea4f94 100644 --- a/libc/utils/UnitTest/CMakeLists.txt +++ b/libc/utils/UnitTest/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_library( LibcUnitTest Test.cpp Test.h - ErrnoSetterMatcher.h LINK_COMPONENTS Support ) target_include_directories(LibcUnitTest PUBLIC ${LIBC_SOURCE_DIR}) diff --git a/libc/utils/tools/CMakeLists.txt b/libc/utils/tools/CMakeLists.txt new file mode 100644 index 0000000000000..e975b7f293af9 --- /dev/null +++ b/libc/utils/tools/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(WrapperGen) diff --git a/libc/utils/tools/WrapperGen/CMakeLists.txt b/libc/utils/tools/WrapperGen/CMakeLists.txt new file mode 100644 index 0000000000000..fe8ffcce94a58 --- /dev/null +++ b/libc/utils/tools/WrapperGen/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_tablegen(libc-wrappergen llvm-libc + Main.cpp +) + +target_include_directories(libc-wrappergen PRIVATE ${LIBC_SOURCE_DIR}) +target_link_libraries(libc-wrappergen PRIVATE LibcTableGenUtil) diff --git a/libc/utils/tools/WrapperGen/Main.cpp b/libc/utils/tools/WrapperGen/Main.cpp new file mode 100644 index 0000000000000..ae606d1b66fa1 --- /dev/null +++ b/libc/utils/tools/WrapperGen/Main.cpp @@ -0,0 +1,72 @@ +//===-- "main" function of libc-wrappergen --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "utils/LibcTableGenUtil/APIIndexer.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Main.h" + +#include <sstream> +#include <string> + +llvm::cl::opt<std::string> + FunctionName("name", llvm::cl::desc("Name of the function to be wrapped."), + llvm::cl::value_desc("<name>"), llvm::cl::Required); + +static bool WrapperGenMain(llvm::raw_ostream &OS, llvm::RecordKeeper &Records) { + llvm_libc::APIIndexer Indexer(Records); + auto Iter = Indexer.FunctionSpecMap.find(FunctionName); + if (Iter == Indexer.FunctionSpecMap.end()) { + llvm::PrintFatalError("Function '" + FunctionName + + "' not found in any standard spec."); + } + + // To avoid all confusion, we include the implementation header using the + // full path (relative to the libc directory.) + std::string Header = Indexer.FunctionToHeaderMap[FunctionName]; + auto RelPath = llvm::StringRef(Header).drop_back(2); // Drop the ".h" suffix. + OS << "#include \"src/" << RelPath << "/" << FunctionName << ".h\"\n"; + + auto &NameSpecPair = *Iter; + llvm::Record *FunctionSpec = NameSpecPair.second; + llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return"); + llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType"); + OS << "extern \"C\" " << Indexer.getTypeAsString(ReturnType) << " " + << FunctionName << "("; + + auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args"); + std::stringstream CallArgs; + std::string ArgPrefix("__arg"); + for (size_t i = 0; i < ArgsList.size(); ++i) { + llvm::Record *ArgType = ArgsList[i]->getValueAsDef("ArgType"); + auto TypeName = Indexer.getTypeAsString(ArgType); + OS << TypeName << " " << ArgPrefix << i; + CallArgs << ArgPrefix << i; + if (i < ArgsList.size() - 1) { + OS << ", "; + CallArgs << ", "; + } + } + + // TODO: Arg types of the C++ implementation functions need not + // match the standard types. Either handle such differences here, or + // avoid such a thing in the implementations. + OS << ") {\n" + << " return __llvm_libc::" << FunctionName << "(" << CallArgs.str() + << ");\n" + << "}\n"; + + return false; +} + +int main(int argc, char *argv[]) { + llvm::cl::ParseCommandLineOptions(argc, argv); + return TableGenMain(argv[0], WrapperGenMain); +} diff --git a/libcxx/cmake/Modules/DefineLinkerScript.cmake b/libcxx/cmake/Modules/DefineLinkerScript.cmake index 11a6ca57dfc0d..be7f026af7e87 100644 --- a/libcxx/cmake/Modules/DefineLinkerScript.cmake +++ b/libcxx/cmake/Modules/DefineLinkerScript.cmake @@ -34,13 +34,14 @@ function(define_linker_script target) if ("${lib}" STREQUAL "cxx-headers") continue() endif() - if (TARGET "${lib}" OR - (${lib} MATCHES "cxxabi(_static|_shared)?" AND HAVE_LIBCXXABI) OR - (${lib} MATCHES "unwind(_static|_shared)?" AND HAVE_LIBUNWIND)) - list(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}$<TARGET_PROPERTY:${lib},OUTPUT_NAME>") - else() - list(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}${lib}") - endif() + # If ${lib} is not a target, we use a dummy target which we know will + # have an OUTPUT_NAME property so that CMake doesn't fail when evaluating + # the non-selected branch of the `IF`. It doesn't matter what it evaluates + # to because it's not selected, but it must not cause an error. + # See https://gitlab.kitware.com/cmake/cmake/-/issues/21045.
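+ # Concretely: output_name_tgt expands to ${lib} when ${lib} is a real target + # and to ${target} otherwise (${target} always exists), and libname then reads + # OUTPUT_NAME from whichever target was selected, falling back to the plain + # ${lib} string when ${lib} is not a target.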
+ set(output_name_tgt "$<IF:$<TARGET_EXISTS:${lib}>,${lib},${target}>") + set(libname "$<IF:$<TARGET_EXISTS:${lib}>,$<TARGET_PROPERTY:${output_name_tgt},OUTPUT_NAME>,${lib}>") + list(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}${libname}") endforeach() endif() string(REPLACE ";" " " link_libraries "${link_libraries}") diff --git a/libcxx/include/__config b/libcxx/include/__config index ebdd64ed4292d..8c22323c2babe 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -348,13 +348,11 @@ # if defined(__FreeBSD__) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT -# define _LIBCPP_HAS_C11_FEATURES # if __FreeBSD_version >= 1300064 || \ (__FreeBSD_version >= 1201504 && __FreeBSD_version < 1300000) # define _LIBCPP_HAS_TIMESPEC_GET # endif # elif defined(__BIONIC__) -# define _LIBCPP_HAS_C11_FEATURES # if __ANDROID_API__ >= 21 # define _LIBCPP_HAS_QUICK_EXIT # endif @@ -368,7 +366,6 @@ # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # elif defined(__linux__) # if !defined(_LIBCPP_HAS_MUSL_LIBC) # if _LIBCPP_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) @@ -376,16 +373,24 @@ # endif # if _LIBCPP_GLIBC_PREREQ(2, 17) # define _LIBCPP_HAS_ALIGNED_ALLOC -# define _LIBCPP_HAS_C11_FEATURES # define _LIBCPP_HAS_TIMESPEC_GET # endif # else // defined(_LIBCPP_HAS_MUSL_LIBC) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # endif -# endif // __linux__ +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) +# define _LIBCPP_HAS_ALIGNED_ALLOC +# define _LIBCPP_HAS_TIMESPEC_GET +# endif +# endif // __APPLE__ #endif #ifndef _LIBCPP_CXX03_LANG diff --git a/libcxx/include/functional b/libcxx/include/functional index 3e9425320fc32..9a0ca96c4611b 100644 --- a/libcxx/include/functional +++ b/libcxx/include/functional @@ -508,10 +504,6 @@ POLICY: For non-variadic implementations, the number of arguments is limited #include <__functional_base> -#if defined(_LIBCPP_HAS_BLOCKS_RUNTIME) && !defined(_LIBCPP_HAS_OBJC_ARC) -#include <Block.h> -#endif - #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) #pragma GCC system_header #endif @@ -2257,6 +2253,9 @@ template <class _Rp, class ..._ArgTypes> class __policy_func<_Rp(_ArgTypes...)> #if defined(_LIBCPP_HAS_BLOCKS_RUNTIME) && !defined(_LIBCPP_HAS_OBJC_ARC) +extern "C" void *_Block_copy(const void *); +extern "C" void _Block_release(const void *); + template <class _Rp1, class ..._ArgTypes1, class _Alloc, class _Rp, class ..._ArgTypes> class __func<_Rp1(^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> @@ -2267,14 +2266,14 @@ class __func<_Rp1(^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> public: _LIBCPP_INLINE_VISIBILITY explicit __func(__block_type const& __f) - : __f_(__f ? Block_copy(__f) : (__block_type)0) + : __f_(reinterpret_cast<__block_type>(__f ? _Block_copy(__f) : nullptr)) { } // [TODO] add && to save on a retain _LIBCPP_INLINE_VISIBILITY explicit __func(__block_type __f, const _Alloc& /* unused */) - : __f_(__f ? Block_copy(__f) : (__block_type)0) + : __f_(reinterpret_cast<__block_type>(__f ?
_Block_copy(__f) : nullptr)) { } virtual __base<_Rp(_ArgTypes...)>* __clone() const { @@ -2291,7 +2290,7 @@ public: virtual void destroy() _NOEXCEPT { if (__f_) - Block_release(__f_); + _Block_release(__f_); __f_ = 0; } diff --git a/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp new file mode 100644 index 0000000000000..d1b41de5dc1a9 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_aligned_alloc.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_ALIGNED_ALLOC (defined by the test suite) and +// _LIBCPP_HAS_ALIGNED_ALLOC (defined by libc++) stay in sync. + +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_ALIGNED_ALLOC) != defined(_LIBCPP_HAS_ALIGNED_ALLOC) +# error "TEST_HAS_ALIGNED_ALLOC and _LIBCPP_HAS_ALIGNED_ALLOC are out of sync" +#endif diff --git a/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp b/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp deleted file mode 100644 index ab1e6ea619252..0000000000000 --- a/libcxx/test/libcxx/language.support/has_c11_features.pass.cpp +++ /dev/null @@ -1,33 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14 - -// We have two macros for checking whether or not the underlying C library -// has C11 features: -// TEST_HAS_C11_FEATURES - which is defined in "test_macros.h" -// _LIBCPP_HAS_C11_FEATURES - which is defined in <__config> -// They should always be the same - -#include <__config> -#include "test_macros.h" - -#ifdef TEST_HAS_C11_FEATURES -# ifndef _LIBCPP_HAS_C11_FEATURES -# error "TEST_HAS_C11_FEATURES is defined, but _LIBCPP_HAS_C11_FEATURES is not" -# endif -#endif - -#ifdef _LIBCPP_HAS_C11_FEATURES -# ifndef TEST_HAS_C11_FEATURES -# error "_LIBCPP_HAS_C11_FEATURES is defined, but TEST_HAS_C11_FEATURES is not" -# endif -#endif - -int main(int, char**) { - return 0; -} diff --git a/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp new file mode 100644 index 0000000000000..ad0fe6fb18fd1 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_quick_exit.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_QUICK_EXIT (defined by the test suite) and +// _LIBCPP_HAS_QUICK_EXIT (defined by libc++) stay in sync. 
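+// (In an #if expression, defined(X) evaluates to 0 or 1, so comparing the two +// with != trips the #error exactly when one macro is defined without the +// other.)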
+ +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_QUICK_EXIT) != defined(_LIBCPP_HAS_QUICK_EXIT) +# error "TEST_HAS_QUICK_EXIT and _LIBCPP_HAS_QUICK_EXIT are out of sync" +#endif diff --git a/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp b/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp new file mode 100644 index 0000000000000..8b86a5ef97195 --- /dev/null +++ b/libcxx/test/libcxx/language.support/has_timespec_get.compile.pass.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Make sure TEST_HAS_TIMESPEC_GET (defined by the test suite) and +// _LIBCPP_HAS_TIMESPEC_GET (defined by libc++) stay in sync. + +#include <__config> +#include "test_macros.h" + +#if defined(TEST_HAS_TIMESPEC_GET) != defined(_LIBCPP_HAS_TIMESPEC_GET) +# error "TEST_HAS_TIMESPEC_GET and _LIBCPP_HAS_TIMESPEC_GET are out of sync" +#endif diff --git a/libcxx/test/libcxx/utilities/function.objects/func.blocks.sh.cpp b/libcxx/test/libcxx/utilities/function.objects/func.blocks.sh.cpp index 33c11651f12f5..9a8e9389426a4 100644 --- a/libcxx/test/libcxx/utilities/function.objects/func.blocks.sh.cpp +++ b/libcxx/test/libcxx/utilities/function.objects/func.blocks.sh.cpp @@ -21,6 +21,8 @@ #include #include +#include + #include "test_macros.h" #include "count_new.h" diff --git a/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp index 779fbc66f5d02..ae061ed851590 100644 --- a/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/float_h.pass.cpp @@ -6,7 +6,10 @@ // //===----------------------------------------------------------------------===// - // test +// Before Clang 9.0, does not define FLT_HAS_SUBNORM & friends in C++. +// XFAIL: clang-4, clang-5, clang-6, clang-7, clang-8 + +// test #include @@ -24,7 +27,7 @@ #error FLT_RADIX not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_HAS_SUBNORM #error FLT_HAS_SUBNORM not defined #endif @@ -54,7 +57,7 @@ #error DECIMAL_DIG not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_DECIMAL_DIG #error FLT_DECIMAL_DIG not defined #endif @@ -164,7 +167,7 @@ #error LDBL_MIN not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_TRUE_MIN #error FLT_TRUE_MIN not defined #endif diff --git a/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp b/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp index 12b80adf01169..1406908b52464 100644 --- a/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/c.limits/cfloat.pass.cpp @@ -6,7 +6,10 @@ // //===----------------------------------------------------------------------===// - // test cfloat +// Before Clang 9.0, does not define FLT_HAS_SUBNORM & friends in C++. 
+// XFAIL: clang-4, clang-5, clang-6, clang-7, clang-8 + +// test cfloat #include @@ -24,7 +27,7 @@ #error FLT_RADIX not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_HAS_SUBNORM #error FLT_HAS_SUBNORM not defined #endif @@ -54,7 +57,7 @@ #error DECIMAL_DIG not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_DECIMAL_DIG #error FLT_DECIMAL_DIG not defined #endif @@ -164,7 +167,7 @@ #error LDBL_MIN not defined #endif -#if TEST_STD_VER > 14 && defined(TEST_HAS_C11_FEATURES) && 0 +#if TEST_STD_VER > 14 #ifndef FLT_TRUE_MIN #error FLT_TRUE_MIN not defined #endif diff --git a/libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp b/libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp similarity index 70% rename from libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp rename to libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp index 10bd5767de0e8..79051fb5167d4 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit_check1.compile.fail.cpp +++ b/libcxx/test/std/language.support/support.start.term/at_quick_exit.compile.fail.cpp @@ -5,23 +5,23 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// + // UNSUPPORTED: c++03 -// test that referencing at_quick_exit when _LIBCPP_HAS_QUICK_EXIT is not defined +// test that referencing at_quick_exit when TEST_HAS_QUICK_EXIT is not defined // results in a compile error. #include -void f() {} +#include "test_macros.h" -int main(int, char**) -{ -#ifndef _LIBCPP_HAS_QUICK_EXIT +void f() { } + +int main(int, char**) { +#if !defined(TEST_HAS_QUICK_EXIT) std::at_quick_exit(f); #else -#error +# error #endif - - return 0; + return 0; } diff --git a/libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp b/libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp similarity index 72% rename from libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp rename to libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp index cb0dbd35c5575..f9c5bdc835694 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit_check2.compile.fail.cpp +++ b/libcxx/test/std/language.support/support.start.term/quick_exit.compile.fail.cpp @@ -5,22 +5,21 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + // UNSUPPORTED: c++03 -// test that referencing quick_exit when _LIBCPP_HAS_QUICK_EXIT is not defined +// test that referencing quick_exit when TEST_HAS_QUICK_EXIT is not defined // results in a compile error. 
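// (When TEST_HAS_QUICK_EXIT is defined, the #else branch below hits #error instead, so compilation is guaranteed to fail either way; the test only exercises the interesting path on configurations without quick_exit.)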
#include -void f() {} +#include "test_macros.h" -int main(int, char**) -{ -#ifndef _LIBCPP_HAS_QUICK_EXIT +int main(int, char**) { +#if !defined(TEST_HAS_QUICK_EXIT) std::quick_exit(0); #else -#error +# error #endif - - return 0; + return 0; } diff --git a/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp b/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp index b45b3391ba5c3..16f68d435640c 100644 --- a/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp +++ b/libcxx/test/std/language.support/support.start.term/quick_exit.pass.cpp @@ -17,7 +17,7 @@ void f() {} int main(int, char**) { -#ifdef _LIBCPP_HAS_QUICK_EXIT +#ifdef TEST_HAS_QUICK_EXIT std::at_quick_exit(f); std::quick_exit(0); #endif diff --git a/libcxx/test/std/numerics/numbers/defined.pass.cpp b/libcxx/test/std/numerics/numbers/defined.pass.cpp index cc8bf179a74e3..ef432b8585e40 100644 --- a/libcxx/test/std/numerics/numbers/defined.pass.cpp +++ b/libcxx/test/std/numerics/numbers/defined.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-no-concepts #include diff --git a/libcxx/test/std/numerics/numbers/illformed.verify.cpp b/libcxx/test/std/numerics/numbers/illformed.verify.cpp index 5e35181f8fd94..d0b21bfa79a59 100644 --- a/libcxx/test/std/numerics/numbers/illformed.verify.cpp +++ b/libcxx/test/std/numerics/numbers/illformed.verify.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-no-concepts #include diff --git a/libcxx/test/std/numerics/numbers/specialize.pass.cpp b/libcxx/test/std/numerics/numbers/specialize.pass.cpp index 380892cca4eae..df31c8afb4baf 100644 --- a/libcxx/test/std/numerics/numbers/specialize.pass.cpp +++ b/libcxx/test/std/numerics/numbers/specialize.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-no-concepts #include diff --git a/libcxx/test/std/numerics/numbers/user_type.pass.cpp b/libcxx/test/std/numerics/numbers/user_type.pass.cpp index d4152c9ff74f6..fa547cf05957e 100644 --- a/libcxx/test/std/numerics/numbers/user_type.pass.cpp +++ b/libcxx/test/std/numerics/numbers/user_type.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-no-concepts #include diff --git a/libcxx/test/std/numerics/numbers/value.pass.cpp b/libcxx/test/std/numerics/numbers/value.pass.cpp index ff6a05ae2de30..9f623e55abaa8 100644 --- a/libcxx/test/std/numerics/numbers/value.pass.cpp +++ b/libcxx/test/std/numerics/numbers/value.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17 +// UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-no-concepts #include diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp 
b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp index bb41fa89be372..e736eb57e0569 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp @@ -28,6 +28,9 @@ struct A int A::count = 0; +struct Base { }; +struct Derived : Base { }; + int main(int, char**) { { @@ -46,5 +49,11 @@ int main(int, char**) assert(test_deleter::count == 0); assert(test_deleter::dealloc_count == 1); + { + // Make sure that we can construct a shared_ptr where the element type and pointer type + // aren't "convertible" but are "compatible". + static_assert(!std::is_constructible, Base[4], test_deleter >::value, ""); + } + return 0; } diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp index e17ae6f34312d..af9bfe9ab057b 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp @@ -28,6 +28,8 @@ struct A int A::count = 0; +struct Base { }; +struct Derived : Base { }; int main(int, char**) { @@ -87,5 +89,13 @@ int main(int, char**) assert(test_deleter::dealloc_count == 1); #endif + { + // Make sure that we can construct a shared_ptr where the element type and pointer type + // aren't "convertible" but are "compatible". + static_assert(!std::is_constructible, + Base[4], test_deleter, + test_allocator >::value, ""); + } + return 0; } diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index cf8696af5588a..0869be349e23d 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -65,9 +65,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // MSVC doesn't have __int128_t. #define _LIBCPP_HAS_NO_INT128 -// MSVC has quick_exit() and at_quick_exit(). -#define _LIBCPP_HAS_QUICK_EXIT - #ifndef _LIBCXX_IN_DEVCRT // atomic_is_lock_free.pass.cpp needs this VS 2015 Update 2 fix. #define _ENABLE_ATOMIC_ALIGNMENT_FIX diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 63aa4f47a70f0..00873607fb11c 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -162,16 +162,17 @@ #endif // Sniff out to see if the underlying C library has C11 features -// Note that at this time (July 2018), MacOS X and iOS do NOT. 
// This is cribbed from __config; but lives here as well because we can't assume libc++ #if __ISO_C_VISIBLE >= 2011 || TEST_STD_VER >= 11 # if defined(__FreeBSD__) // Specifically, FreeBSD does NOT have timespec_get, even though they have all // the rest of C11 - this is PR#38495 # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES +# define TEST_HAS_QUICK_EXIT # elif defined(__BIONIC__) -# define TEST_HAS_C11_FEATURES +# if __ANDROID_API__ >= 21 +# define TEST_HAS_QUICK_EXIT +# endif # if __ANDROID_API__ >= 28 # define TEST_HAS_ALIGNED_ALLOC # endif @@ -179,8 +180,8 @@ # define TEST_HAS_TIMESPEC_GET # endif # elif defined(__Fuchsia__) || defined(__wasi__) || defined(__NetBSD__) +# define TEST_HAS_QUICK_EXIT # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES # define TEST_HAS_TIMESPEC_GET # elif defined(__linux__) // This block preserves the old behavior used by include/__config: @@ -188,23 +189,35 @@ // available. The configuration here may be too vague though, as Bionic, uClibc, // newlib, etc may all support these features but need to be configured. # if defined(TEST_GLIBC_PREREQ) +# if TEST_GLIBC_PREREQ(2, 15) +# define TEST_HAS_QUICK_EXIT +# endif # if TEST_GLIBC_PREREQ(2, 17) # define TEST_HAS_ALIGNED_ALLOC # define TEST_HAS_TIMESPEC_GET -# define TEST_HAS_C11_FEATURES # endif # elif defined(_LIBCPP_HAS_MUSL_LIBC) +# define TEST_HAS_QUICK_EXIT # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES # define TEST_HAS_TIMESPEC_GET # endif # elif defined(_WIN32) # if defined(_MSC_VER) && !defined(__MINGW32__) +# define TEST_HAS_QUICK_EXIT +# define TEST_HAS_ALIGNED_ALLOC +# define TEST_HAS_TIMESPEC_GET +# endif +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) # define TEST_HAS_ALIGNED_ALLOC -# define TEST_HAS_C11_FEATURES // Using Microsoft's C Runtime library # define TEST_HAS_TIMESPEC_GET # endif -# endif +# endif // __APPLE__ #endif /* Features that were introduced in C++14 */ diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index cd500e132946e..95d23df7ece13 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -52,7 +52,7 @@ def _executeScriptInternal(test, commands): res = ('', '', 127, None) return res -def _makeConfigTest(config, testPrefix=None): +def _makeConfigTest(config, testPrefix=''): sourceRoot = os.path.join(config.test_exec_root, '__config_src__') execRoot = os.path.join(config.test_exec_root, '__config_exec__') suite = lit.Test.TestSuite('__config__', sourceRoot, execRoot, config) @@ -83,7 +83,7 @@ def sourceBuilds(config, source): _executeScriptInternal(test, ['rm %t.exe']) return exitCode == 0 -def programOutput(config, program, args=[], testPrefix=None): +def programOutput(config, program, args=[], testPrefix=''): """ Compiles a program for the test target, run it on the test target and return the output. 
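The per-platform TEST_HAS_* blocks above are easiest to sanity-check empirically. A small probe along the following lines (a sketch that assumes test_macros.h is on the include path, as it is within the libc++ test suite) shows which features a given configuration ends up with:

#include <cstdio>
#include "test_macros.h"

int main() {
  // Each macro is either defined or not by the per-platform logic above.
#ifdef TEST_HAS_QUICK_EXIT
  std::puts("TEST_HAS_QUICK_EXIT: yes");
#else
  std::puts("TEST_HAS_QUICK_EXIT: no");
#endif
#ifdef TEST_HAS_ALIGNED_ALLOC
  std::puts("TEST_HAS_ALIGNED_ALLOC: yes");
#else
  std::puts("TEST_HAS_ALIGNED_ALLOC: no");
#endif
#ifdef TEST_HAS_TIMESPEC_GET
  std::puts("TEST_HAS_TIMESPEC_GET: yes");
#else
  std::puts("TEST_HAS_TIMESPEC_GET: no");
#endif
  return 0;
}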
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index a9431ec073f8d..773890b015097 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -8,7 +8,7 @@ from libcxx.test.dsl import * -_allStandards = ['c++98', 'c++03', 'c++11', 'c++14', 'c++17', 'c++2a'] +_allStandards = ['c++03', 'c++11', 'c++14', 'c++17', 'c++2a'] DEFAULT_PARAMETERS = [ # Core parameters of the test suite diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index c5b536532f3c2..8419d851ab7f4 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -137,6 +137,7 @@ option(LIBUNWIND_ENABLE_THREADS "Build libunwind with threading support." ON) option(LIBUNWIND_WEAK_PTHREAD_LIB "Use weak references to refer to pthread functions." OFF) option(LIBUNWIND_USE_COMPILER_RT "Use compiler-rt instead of libgcc" OFF) option(LIBUNWIND_INCLUDE_DOCS "Build the libunwind documentation." ${LLVM_INCLUDE_DOCS}) +option(LIBUNWIND_IS_BAREMETAL "Build libunwind for baremetal targets." OFF) set(LIBUNWIND_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING "Define suffix of library directory name (32/64)") @@ -270,6 +271,8 @@ add_compile_flags_if_supported(-Wunused-variable) add_compile_flags_if_supported(-Wwrite-strings) add_compile_flags_if_supported(-Wundef) +add_compile_flags_if_supported(-Wno-suggest-override) + if (LIBUNWIND_ENABLE_WERROR) add_compile_flags_if_supported(-Werror) add_compile_flags_if_supported(-WX) @@ -361,6 +364,10 @@ if (LIBUNWIND_ENABLE_ARM_WMMX) add_compile_flags(-D__ARM_WMMX) endif() +if(LIBUNWIND_IS_BAREMETAL) + add_compile_definitions(_LIBUNWIND_IS_BAREMETAL) +endif() + # This is the _ONLY_ place where add_definitions is called. if (MSVC) add_definitions(-D_CRT_SECURE_NO_WARNINGS) diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index 764aaa3489f26..3d1e810f43c08 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -39,13 +39,6 @@ struct EHABIIndexEntry { }; #endif -#ifdef __APPLE__ -#include -namespace libunwind { - bool checkKeyMgrRegisteredFDEs(uintptr_t targetAddr, void *&fde); -} -#endif - #include "libunwind.h" #include "config.h" #include "dwarf2.h" @@ -651,14 +644,10 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr, inline bool LocalAddressSpace::findOtherFDE(pint_t targetAddr, pint_t &fde) { -#ifdef __APPLE__ - return checkKeyMgrRegisteredFDEs(targetAddr, *((void**)&fde)); -#else // TO DO: if OS has way to dynamically register FDEs, check that. 
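// With the Apple keymgr path removed above, this function now unconditionally // reports that no dynamically registered FDE was found, on every platform.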
(void)targetAddr; (void)fde; return false; -#endif } inline bool LocalAddressSpace::findFunctionName(pint_t addr, char *buf, diff --git a/libunwind/src/Unwind_AppleExtras.cpp b/libunwind/src/Unwind_AppleExtras.cpp index 536303993effb..1d9948aced355 100644 --- a/libunwind/src/Unwind_AppleExtras.cpp +++ b/libunwind/src/Unwind_AppleExtras.cpp @@ -12,33 +12,6 @@ #include "DwarfParser.hpp" -// private keymgr stuff -#define KEYMGR_GCC3_DW2_OBJ_LIST 302 -extern "C" { - extern void _keymgr_set_and_unlock_processwide_ptr(int key, void *ptr); - extern void *_keymgr_get_and_lock_processwide_ptr(int key); -} - -// undocumented libgcc "struct object" -struct libgcc_object { - void *start; - void *unused1; - void *unused2; - void *fde; - unsigned long encoding; - void *fde_end; - libgcc_object *next; -}; - -// undocumented libgcc "struct km_object_info" referenced by -// KEYMGR_GCC3_DW2_OBJ_LIST -struct libgcc_object_info { - libgcc_object *seen_objects; - libgcc_object *unseen_objects; - unsigned spare[2]; -}; - - // static linker symbols to prevent wrong two level namespace for _Unwind symbols #if defined(__arm__) #define NOT_HERE_BEFORE_5_0(sym) \ @@ -140,44 +113,3 @@ NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Resume_or_Rethrow) NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Unregister) #endif // defined(_LIBUNWIND_BUILD_SJLJ_APIS) - - -namespace libunwind { - -_LIBUNWIND_HIDDEN -bool checkKeyMgrRegisteredFDEs(uintptr_t pc, void *&fde) { -#if __MAC_OS_X_VERSION_MIN_REQUIRED - // lastly check for old style keymgr registration of dynamically generated - // FDEs acquire exclusive access to libgcc_object_info - libgcc_object_info *head = (libgcc_object_info *) - _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); - if (head != NULL) { - // look at each FDE in keymgr - for (libgcc_object *ob = head->unseen_objects; ob != NULL; ob = ob->next) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; - const char *msg = CFI_Parser::decodeFDE( - LocalAddressSpace::sThisAddressSpace, - (uintptr_t)ob->fde, &fdeInfo, &cieInfo); - if (msg == NULL) { - // Check if this FDE is for a function that includes the pc - if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { - fde = (void*)fdeInfo.pcStart; - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, - head); - return true; - } - } - } - } - // release libgcc_object_info - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, head); -#else - (void)pc; - (void)fde; -#endif - return false; -} - -} - diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt index 794a59f58f84a..2b945e6eff762 100644 --- a/libunwind/test/CMakeLists.txt +++ b/libunwind/test/CMakeLists.txt @@ -32,4 +32,4 @@ configure_lit_site_cfg( add_lit_testsuite(check-unwind "Running libunwind tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${LIBUNWIND_TEST_DEPS}) + DEPENDS unwind ${LIBUNWIND_TEST_DEPS}) diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt index bcfc2c6270b3f..e9bd1bd29c5cf 100644 --- a/lld/CMakeLists.txt +++ b/lld/CMakeLists.txt @@ -51,10 +51,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) - if(LLVM_ENABLE_ZLIB) - find_package(ZLIB) - endif() - include(AddLLVM) include(TableGen) include(HandleLLVMOptions) diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt index 796f7a82a3de1..bbcd337b210f3 100644 --- a/lld/COFF/CMakeLists.txt +++ b/lld/COFF/CMakeLists.txt @@ -3,6 
+3,7 @@ tablegen(LLVM Options.inc -gen-opt-parser-defs) add_public_tablegen_target(COFFOptionsTableGen) add_lld_library(lldCOFF + CallGraphSort.cpp Chunks.cpp DebugTypes.cpp DLL.cpp diff --git a/lld/COFF/CallGraphSort.cpp b/lld/COFF/CallGraphSort.cpp new file mode 100644 index 0000000000000..d3e5312ce7fdd --- /dev/null +++ b/lld/COFF/CallGraphSort.cpp @@ -0,0 +1,245 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This is based on the ELF port, see ELF/CallGraphSort.cpp for the details +/// about the algorithm. +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "InputFiles.h" +#include "SymbolTable.h" +#include "Symbols.h" +#include "lld/Common/ErrorHandler.h" + +#include <numeric> + +using namespace llvm; +using namespace lld; +using namespace lld::coff; + +namespace { +struct Edge { + int from; + uint64_t weight; +}; + +struct Cluster { + Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} + + double getDensity() const { + if (size == 0) + return 0; + return double(weight) / double(size); + } + + int next; + int prev; + uint64_t size; + uint64_t weight = 0; + uint64_t initialWeight = 0; + Edge bestPred = {-1, 0}; +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap<const SectionChunk *, int> run(); + +private: + std::vector<Cluster> clusters; + std::vector<const SectionChunk *> sections; +}; + +// Maximum amount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +using SectionPair = std::pair<const SectionChunk *, const SectionChunk *>; + +// Take the edge list in config->callGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile; + DenseMap<const SectionChunk *, int> secToCluster; + + auto getOrCreateNode = [&](const SectionChunk *isec) -> int { + auto res = secToCluster.try_emplace(isec, clusters.size()); + if (res.second) { + sections.push_back(isec); + clusters.emplace_back(clusters.size(), isec->getSize()); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair<SectionPair, uint64_t> &c : profile) { + const auto *fromSec = cast<SectionChunk>(c.first.first->repl); + const auto *toSec = cast<SectionChunk>(c.first.second->repl); + uint64_t weight = c.second; + + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (fromSec->getOutputSection() != toSec->getOutputSection()) + continue; + + int from = getOrCreateNode(fromSec); + int to = getOrCreateNode(toSec); + + clusters[to].weight += weight; + + if (from == to) + continue; + + // Remember the best edge. + Cluster &toC = clusters[to]; + if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { + toC.bestPred.from = from; + toC.bestPred.weight = weight; + } + } + for (Cluster &c : clusters) + c.initialWeight = c.weight; +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &a, Cluster &b) { + double newDensity = double(a.weight + b.weight) / double(a.size + b.size); + return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; +} + +// Find the leader of v's cluster (clusters are represented as equivalence +// classes). We apply the union-find path-halving technique along the way; it +// is simple to implement and keeps the trees shallow, reducing lookup time. +static int getLeader(std::vector<int> &leaders, int v) { + while (leaders[v] != v) { + leaders[v] = leaders[leaders[v]]; + v = leaders[v]; + } + return v; +} + +static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, + Cluster &from, int fromIdx) { + int tail1 = into.prev, tail2 = from.prev; + into.prev = tail2; + cs[tail2].next = intoIdx; + from.prev = tail1; + cs[tail1].next = fromIdx; + into.size += from.size; + into.weight += from.weight; + from.size = 0; + from.weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +DenseMap<const SectionChunk *, int> CallGraphSort::run() { + std::vector<int> sorted(clusters.size()); + std::vector<int> leaders(clusters.size()); + + std::iota(leaders.begin(), leaders.end(), 0); + std::iota(sorted.begin(), sorted.end(), 0); + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + for (int l : sorted) { + // The cluster index is the same as the index of its leader here because + // clusters[l] has not been merged into another cluster yet. + Cluster &c = clusters[l]; + + // Don't consider merging if the edge is unlikely. + if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) + continue; + + int predL = getLeader(leaders, c.bestPred.from); + if (l == predL) + continue; + + Cluster *predC = &clusters[predL]; + if (c.size + predC->size > MAX_CLUSTER_SIZE) + continue; + + if (isNewDensityBad(*predC, c)) + continue; + + leaders[l] = predL; + mergeClusters(clusters, *predC, predL, c, l); + } + + // Sort remaining non-empty clusters by density. + sorted.clear(); + for (int i = 0, e = (int)clusters.size(); i != e; ++i) + if (clusters[i].size > 0) + sorted.push_back(i); + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + DenseMap<const SectionChunk *, int> orderMap; + // Sections will be sorted in increasing priority order; sections absent from + // the map get priority 0 and are therefore placed after all listed sections, + // which start from INT_MIN. + int curOrder = INT_MIN; + for (int leader : sorted) { + for (int i = leader;;) { + orderMap[sections[i]] = curOrder++; + i = clusters[i].next; + if (i == leader) + break; + } + } + if (!config->printSymbolOrder.empty()) { + std::error_code ec; + raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); + if (ec) { + error("cannot open " + config->printSymbolOrder + ": " + ec.message()); + return orderMap; + } + // Print the symbols ordered by C3, in increasing curOrder. Instead of + // sorting the whole orderMap, just repeat the loops above. + for (int leader : sorted) + for (int i = leader;;) { + const SectionChunk *sc = sections[i]; + + // Search all the symbols in the section's file and find the + // DefinedCOFF symbols that lie within the section.
+ for (Symbol *sym : sc->file->getSymbols()) + if (auto *d = dyn_cast_or_null(sym)) + // Filter out non-COMDAT symbols and section symbols. + if (d->isCOMDAT && !d->getCOFFSymbol().isSection() && + sc == d->getChunk()) + os << sym->getName() << "\n"; + i = clusters[i].next; + if (i == leader) + break; + } + } + + return orderMap; +} + +// Sort sections by the profile data provided by /call-graph-ordering-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ heuristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap coff::computeCallGraphProfileOrder() { + return CallGraphSort().run(); +} diff --git a/lld/COFF/CallGraphSort.h b/lld/COFF/CallGraphSort.h new file mode 100644 index 0000000000000..e4f372137448c --- /dev/null +++ b/lld/COFF/CallGraphSort.h @@ -0,0 +1,22 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_COFF_CALL_GRAPH_SORT_H +#define LLD_COFF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace coff { +class SectionChunk; + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace coff +} // namespace lld + +#endif diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 72d826b8bd173..286b67b437a89 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -9,6 +9,7 @@ #ifndef LLD_COFF_CONFIG_H #define LLD_COFF_CONFIG_H +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/COFF.h" @@ -29,6 +30,7 @@ class DefinedRelative; class StringChunk; class Symbol; class InputFile; +class SectionChunk; // Short aliases. static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64; @@ -140,6 +142,7 @@ struct Configuration { bool safeSEH = false; Symbol *sehTable = nullptr; Symbol *sehCount = nullptr; + bool noSEH = false; // Used for /opt:lldlto=N unsigned ltoo = 2; @@ -200,6 +203,15 @@ struct Configuration { // Used for /lto-obj-path: llvm::StringRef ltoObjPath; + // Used for /call-graph-ordering-file: + llvm::MapVector, + uint64_t> + callGraphProfile; + bool callGraphProfileSort = false; + + // Used for /print-symbol-order: + StringRef printSymbolOrder; + uint64_t align = 4096; uint64_t imageBase = -1; uint64_t fileAlign = 512; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 7372505bb6161..55e97d50c2263 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -34,6 +34,7 @@ #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/LEB128.h" @@ -924,6 +925,75 @@ static void parseOrderFile(StringRef arg) { } } +static void parseCallGraphFile(StringRef path) { + std::unique_ptr mb = CHECK( + MemoryBuffer::getFile(path, -1, false, true), "could not open " + path); + + // Build a map from symbol name to section. 
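// For reference, each line of a call-graph ordering file names the caller
// symbol, the callee symbol, and an edge weight, separated by single spaces.
// A hypothetical example (the symbol names are illustrative only):
//
//   main compute 100
//   compute helper 40
//
// Anything that does not match this three-field "from to count" shape is
// rejected by the parse-error check below.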
+ DenseMap map; + for (ObjFile *file : ObjFile::instances) + for (Symbol *sym : file->getSymbols()) + if (sym) + map[sym->getName()] = sym; + + auto findSection = [&](StringRef name) -> SectionChunk * { + Symbol *sym = map.lookup(name); + if (!sym) { + if (config->warnMissingOrderSymbol) + warn(path + ": no such symbol: " + name); + return nullptr; + } + + if (DefinedCOFF *dr = dyn_cast_or_null(sym)) + return dyn_cast_or_null(dr->getChunk()); + return nullptr; + }; + + for (StringRef line : args::getLines(*mb)) { + SmallVector fields; + line.split(fields, ' '); + uint64_t count; + + if (fields.size() != 3 || !to_integer(fields[2], count)) { + error(path + ": parse error"); + return; + } + + if (SectionChunk *from = findSection(fields[0])) + if (SectionChunk *to = findSection(fields[1])) + config->callGraphProfile[{from, to}] += count; + } +} + +static void readCallGraphsFromObjectFiles() { + for (ObjFile *obj : ObjFile::instances) { + if (obj->callgraphSec) { + ArrayRef contents; + cantFail( + obj->getCOFFObj()->getSectionContents(obj->callgraphSec, contents)); + BinaryStreamReader reader(contents, support::little); + while (!reader.empty()) { + uint32_t fromIndex, toIndex; + uint64_t count; + if (Error err = reader.readInteger(fromIndex)) + fatal(toString(obj) + ": Expected 32-bit integer"); + if (Error err = reader.readInteger(toIndex)) + fatal(toString(obj) + ": Expected 32-bit integer"); + if (Error err = reader.readInteger(count)) + fatal(toString(obj) + ": Expected 64-bit integer"); + auto *fromSym = dyn_cast_or_null(obj->getSymbol(fromIndex)); + auto *toSym = dyn_cast_or_null(obj->getSymbol(toIndex)); + if (!fromSym || !toSym) + continue; + auto *from = dyn_cast_or_null(fromSym->getChunk()); + auto *to = dyn_cast_or_null(toSym->getChunk()); + if (from && to) + config->callGraphProfile[{from, to}] += count; + } + } + } +} + static void markAddrsig(Symbol *s) { if (auto *d = dyn_cast_or_null(s)) if (SectionChunk *c = dyn_cast_or_null(d->getChunk())) @@ -1587,9 +1657,11 @@ void LinkerDriver::link(ArrayRef argsArr) { args.hasFlag(OPT_auto_import, OPT_auto_import_no, config->mingw); config->pseudoRelocs = args.hasFlag( OPT_runtime_pseudo_reloc, OPT_runtime_pseudo_reloc_no, config->mingw); + config->callGraphProfileSort = args.hasFlag( + OPT_call_graph_profile_sort, OPT_call_graph_profile_sort_no, true); - // Don't warn about long section names, such as .debug_info, for mingw or when - // -debug:dwarf is requested. + // Don't warn about long section names, such as .debug_info, for mingw or + // when -debug:dwarf is requested. if (config->mingw || config->debugDwarf) config->warnLongSectionNames = false; @@ -1700,9 +1772,10 @@ void LinkerDriver::link(ArrayRef argsArr) { config->wordsize = config->is64() ? 8 : 4; // Handle /safeseh, x86 only, on by default, except for mingw. - if (config->machine == I386 && - args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw)) - config->safeSEH = true; + if (config->machine == I386) { + config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw); + config->noSEH = args.hasArg(OPT_noseh); + } // Handle /functionpadmin for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt)) @@ -2023,8 +2096,24 @@ void LinkerDriver::link(ArrayRef argsArr) { // Handle /order. We want to do this at this moment because we // need a complete list of comdat sections to warn on nonexistent // functions. 
- if (auto *arg = args.getLastArg(OPT_order)) + if (auto *arg = args.getLastArg(OPT_order)) { + if (args.hasArg(OPT_call_graph_ordering_file)) + error("/order and /call-graph-order-file may not be used together"); parseOrderFile(arg->getValue()); + config->callGraphProfileSort = false; + } + + // Handle /call-graph-ordering-file and /call-graph-profile-sort (default on). + if (config->callGraphProfileSort) { + if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) { + parseCallGraphFile(arg->getValue()); + } + readCallGraphsFromObjectFiles(); + } + + // Handle /print-symbol-order. + if (auto *arg = args.getLastArg(OPT_print_symbol_order)) + config->printSymbolOrder = arg->getValue(); // Identify unreferenced COMDAT sections. if (config->doGC) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 0adc2b91bd999..0bcc6c940bbab 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -249,6 +249,11 @@ SectionChunk *ObjFile::readSection(uint32_t sectionNumber, return nullptr; } + if (name == ".llvm.call-graph-profile") { + callgraphSec = sec; + return nullptr; + } + // Object files may have DWARF debug info or MS CodeView debug info // (or both). // @@ -348,13 +353,13 @@ void ObjFile::recordPrevailingSymbolForMingw( // of the section chunk we actually include instead of discarding it, // add the symbol to a map to allow using it for implicitly // associating .[px]data$ sections to it. + // Use the suffix from the .text$ instead of the leader symbol + // name, for cases where the names differ (i386 mangling/decorations, + // cases where the leader is a weak symbol named .weak.func.default*). int32_t sectionNumber = sym.getSectionNumber(); SectionChunk *sc = sparseChunks[sectionNumber]; if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) { - StringRef name; - name = check(coffObj->getSymbolName(sym)); - if (getMachineType() == I386) - name.consume_front("_"); + StringRef name = sc->getSectionName().split('$').second; prevailingSectionMap[name] = sectionNumber; } } diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 50323f596e2cc..1e0b97a82be29 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -191,6 +191,8 @@ class ObjFile : public InputFile { const coff_section *addrsigSec = nullptr; + const coff_section *callgraphSec = nullptr; + // When using Microsoft precompiled headers, this is the PCH's key. // The same key is used by both the precompiled object, and objects using the // precompiled object. Any difference indicates out-of-date objects. 
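The .llvm.call-graph-profile records consumed by readCallGraphsFromObjectFiles above are fixed-width little-endian triples. A minimal sketch of the layout being decoded (the struct and field names are illustrative, not part of the patch):

  #include <cstdint>

  // One edge record in a .llvm.call-graph-profile section, as read above with
  // BinaryStreamReader: two 32-bit COFF symbol-table indices plus a 64-bit
  // call count, all little-endian.
  struct CallGraphProfileRecord {
    uint32_t fromIndex; // symbol-table index of the caller's section symbol
    uint32_t toIndex;   // symbol-table index of the callee's section symbol
    uint64_t count;     // profiled call count for this edge
  };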
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp index bded985f04d0f..e24cdca6ee34c 100644 --- a/lld/COFF/MinGW.cpp +++ b/lld/COFF/MinGW.cpp @@ -34,6 +34,11 @@ AutoExporter::AutoExporter() { "libclang_rt.builtins-arm", "libclang_rt.builtins-i386", "libclang_rt.builtins-x86_64", + "libclang_rt.profile", + "libclang_rt.profile-aarch64", + "libclang_rt.profile-arm", + "libclang_rt.profile-i386", + "libclang_rt.profile-x86_64", "libc++", "libc++abi", "libunwind", @@ -57,6 +62,10 @@ AutoExporter::AutoExporter() { "__builtin_", // Artificial symbols such as .refptr ".", + // profile generate symbols + "__profc_", + "__profd_", + "__profvp_", }; excludeSymbolSuffixes = { diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 212879e1d60bd..d1badf0fdd2f0 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -204,6 +204,7 @@ def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">, HelpText<"Add symbol as undefined, but allow it to remain undefined">; def kill_at : F<"kill-at">; def lldmingw : F<"lldmingw">; +def noseh : F<"noseh">; def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">; def pdb_source_path : P<"pdbsourcepath", "Base path used to make relative source file path absolute in PDB">; @@ -234,6 +235,17 @@ def dash_dash_version : Flag<["--"], "version">, def threads : P<"threads", "Number of threads. '1' disables multi-threading. By " "default all available hardware threads are used">; +def call_graph_ordering_file: P< + "call-graph-ordering-file", + "Layout sections to optimize the given callgraph">; +defm call_graph_profile_sort: B< + "call-graph-profile-sort", + "Reorder sections with call graph profile (default)", + "Do not reorder sections with call graph profile">; +def print_symbol_order: P< + "print-symbol-order", + "Print a symbol order specified by /call-graph-ordering-file and " + "/call-graph-profile-sort into the specified file">; // Flags for debugging def lldmap : F<"lldmap">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3bcc1777f7ac8..36ecdcd13512c 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "Config.h" #include "DLL.h" #include "InputFiles.h" @@ -229,6 +230,7 @@ class Writer { void setSectionPermissions(); void writeSections(); void writeBuildId(); + void sortSections(); void sortExceptionTable(); void sortCRTSectionChunks(std::vector &chunks); void addSyntheticIdata(); @@ -798,6 +800,19 @@ static bool shouldStripSectionSuffix(SectionChunk *sc, StringRef name) { name.startswith(".xdata$") || name.startswith(".eh_frame$"); } +void Writer::sortSections() { + if (!config->callGraphProfile.empty()) { + DenseMap order = computeCallGraphProfileOrder(); + for (auto it : order) { + if (DefinedRegular *sym = it.first->sym) + config->order[sym->getName()] = it.second; + } + } + if (!config->order.empty()) + for (auto it : partialSections) + sortBySectionOrder(it.second->chunks); +} + // Create output section objects and add them to OutputSections. void Writer::createSections() { // First, create the builtin sections. @@ -861,10 +876,7 @@ void Writer::createSections() { if (hasIdata) addSyntheticIdata(); - // Process an /order option. 
- if (!config->order.empty()) - for (auto it : partialSections) - sortBySectionOrder(it.second->chunks); + sortSections(); if (hasIdata) locateImportTables(); @@ -1393,7 +1405,7 @@ template void Writer::writeHeader() { pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF; if (config->integrityCheck) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY; - if (setNoSEHCharacteristic) + if (setNoSEHCharacteristic || config->noSEH) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH; if (config->terminalServerAware) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE; diff --git a/lld/Common/Args.cpp b/lld/Common/Args.cpp index 4ea3a435c7ae0..507830f9da346 100644 --- a/lld/Common/Args.cpp +++ b/lld/Common/Args.cpp @@ -26,14 +26,14 @@ CodeGenOpt::Level lld::args::getCGOptLevel(int optLevelLTO) { return CodeGenOpt::Default; } -int64_t lld::args::getInteger(opt::InputArgList &args, unsigned key, - int64_t Default) { +static int64_t getInteger(opt::InputArgList &args, unsigned key, + int64_t Default, unsigned base) { auto *a = args.getLastArg(key); if (!a) return Default; int64_t v; - if (to_integer(a->getValue(), v, 10)) + if (to_integer(a->getValue(), v, base)) return v; StringRef spelling = args.getArgString(a->getIndex()); @@ -41,6 +41,16 @@ int64_t lld::args::getInteger(opt::InputArgList &args, unsigned key, return 0; } +int64_t lld::args::getInteger(opt::InputArgList &args, unsigned key, + int64_t Default) { + return ::getInteger(args, key, Default, 10); +} + +int64_t lld::args::getHex(opt::InputArgList &args, unsigned key, + int64_t Default) { + return ::getInteger(args, key, Default, 16); +} + std::vector lld::args::getStrings(opt::InputArgList &args, int id) { std::vector v; for (auto *arg : args.filtered(id)) diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index da0b510d96a10..c1ad72eaa2c9d 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -1035,11 +1035,6 @@ bool PPC64::needsThunk(RelExpr expr, RelType type, const InputFile *file, type != R_PPC64_REL24_NOTOC) return false; - // FIXME: Remove the fatal error once the call protocol is implemented. - if (type == R_PPC64_REL24_NOTOC && s.isInPlt()) - fatal("unimplemented feature: external function call with the reltype" - " R_PPC64_REL24_NOTOC"); - // If a function is in the Plt it needs to be called with a call-stub. 
if (s.isInPlt()) return true; diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index 21c641b5161f8..15da4d2414ab4 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -68,7 +68,7 @@ struct Cluster { int next; int prev; - size_t size = 0; + uint64_t size; uint64_t weight = 0; uint64_t initialWeight = 0; Edge bestPred = {-1, 0}; @@ -223,14 +223,14 @@ DenseMap CallGraphSort::run() { DenseMap orderMap; int curOrder = 1; - for (int leader : sorted) + for (int leader : sorted) { for (int i = leader;;) { orderMap[sections[i]] = curOrder++; i = clusters[i].next; if (i == leader) break; } - + } if (!config->printSymbolOrder.empty()) { std::error_code ec; raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index e74a4a0c5b223..1afeee02efb30 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -10,7 +10,9 @@ #define LLD_ELF_CONFIG_H #include "lld/Common/ErrorHandler.h" +#include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/BinaryFormat/ELF.h" @@ -90,11 +92,13 @@ struct Configuration { uint8_t osabi = 0; uint32_t andFeatures = 0; llvm::CachePruningPolicy thinLTOCachePolicy; + llvm::SetVector dependencyFiles; // for --dependency-file llvm::StringMap sectionStartMap; llvm::StringRef bfdname; llvm::StringRef chroot; - llvm::StringRef dynamicLinker; + llvm::StringRef dependencyFile; llvm::StringRef dwoDir; + llvm::StringRef dynamicLinker; llvm::StringRef entry; llvm::StringRef emulation; llvm::StringRef fini; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 30bff945f1a54..012da1485acb1 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -495,6 +495,9 @@ void LinkerDriver::main(ArrayRef argsArr) { tar = std::move(*errOrWriter); tar->append("response.txt", createResponseFile(args)); tar->append("version.txt", getLLDVersion() + "\n"); + StringRef ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); + if (!ltoSampleProfile.empty()) + readFile(ltoSampleProfile); } else { error("--reproduce: " + toString(errOrWriter.takeError())); } @@ -915,6 +918,7 @@ static void readConfigs(opt::InputArgList &args) { config->optimizeBBJumps = args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false); config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); + config->dependencyFile = args.getLastArgValue(OPT_dependency_file); config->dependentLibraries = args.hasFlag(OPT_dependent_libraries, OPT_no_dependent_libraries, true); config->disableVerify = args.hasArg(OPT_disable_verify); config->discard = getDiscard(args); @@ -962,10 +966,10 @@ static void readConfigs(opt::InputArgList &args) { config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); config->ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); config->ltoBasicBlockSections = - args.getLastArgValue(OPT_lto_basicblock_sections); + args.getLastArgValue(OPT_lto_basic_block_sections); config->ltoUniqueBasicBlockSectionNames = - args.hasFlag(OPT_lto_unique_bb_section_names, - OPT_no_lto_unique_bb_section_names, false); + args.hasFlag(OPT_lto_unique_basic_block_section_names, + OPT_no_lto_unique_basic_block_section_names, false); config->mapFile = args.getLastArgValue(OPT_Map); config->mipsGotSize = args::getInteger(args, OPT_mips_got_size, 0xfff0); config->mergeArmExidx = @@ -1561,6 +1565,75 @@ static void handleLibcall(StringRef name) { sym->fetch(); } +// 
Handle --dependency-file=<path>. If that option is given, lld creates a +// file at a given path with the following contents: +// +// <output-file>: <input-file> ... +// +// <input-file>: +// +// where <output-file> is the pathname of the output file and +// <input-file> ... is a list of pathnames of all input files. The `make` command can read a +// file in the above format and interpret it as dependency info. We write +// phony targets for every <input-file> to avoid an error when that file is +// removed. +// +// This option is useful if you want to make your final executable depend +// on all input files including system libraries. Here is why. +// +// When you write a Makefile, you usually write it so that the final +// executable depends on all user-generated object files. Normally, you +// don't make your executable depend on system libraries (such as libc) +// because you don't know the exact paths of libraries, even though system +// libraries that are linked to your executable statically are technically a +// part of your program. By using the --dependency-file option, you can make +// lld dump dependency info so that you can maintain exact dependencies +// easily. +static void writeDependencyFile() { + std::error_code ec; + raw_fd_ostream os(config->dependencyFile, ec, sys::fs::F_None); + if (ec) { + error("cannot open " + config->dependencyFile + ": " + ec.message()); + return; + } + + // We use the same escape rules as Clang/GCC which are accepted by Make/Ninja: + // * A space is escaped by a backslash which itself must be escaped. + // * A hash sign is escaped by a single backslash. + // * A $ is escaped as $$. + auto printFilename = [](raw_fd_ostream &os, StringRef filename) { + llvm::SmallString<256> nativePath; + llvm::sys::path::native(filename.str(), nativePath); + llvm::sys::path::remove_dots(nativePath, /*remove_dot_dot=*/true); + for (unsigned i = 0, e = nativePath.size(); i != e; ++i) { + if (nativePath[i] == '#') { + os << '\\'; + } else if (nativePath[i] == ' ') { + os << '\\'; + unsigned j = i; + while (j > 0 && nativePath[--j] == '\\') + os << '\\'; + } else if (nativePath[i] == '$') { + os << '$'; + } + os << nativePath[i]; + } + }; + + os << config->outputFile << ":"; + for (StringRef path : config->dependencyFiles) { + os << " \\\n "; + printFilename(os, path); + } + os << "\n"; + + for (StringRef path : config->dependencyFiles) { + os << "\n"; + printFilename(os, path); + os << ":\n"; + } +} + // Replaces common symbols with defined symbols that reside in .bss sections. // This function is called after all symbol names are resolved. As a // result, the passes after the symbol resolution won't see any @@ -1779,7 +1852,7 @@ static std::vector<WrappedSymbol> addWrappedSymbols(opt::InputArgList &args) { continue; Symbol *real = addUndefined(saver.save("__real_" + name)); - Symbol *wrap = addUndefined(saver.save("__wrap_" + name)); + Symbol *wrap = addUnusedUndefined(saver.save("__wrap_" + name)); v.push_back({sym, real, wrap}); // We want to tell LTO not to inline symbols to be overwritten @@ -1789,7 +1862,8 @@ static std::vector<WrappedSymbol> addWrappedSymbols(opt::InputArgList &args) { // Tell LTO not to eliminate these symbols. sym->isUsedInRegularObj = true; - wrap->isUsedInRegularObj = true; + if (wrap->isDefined()) + wrap->isUsedInRegularObj = true; } return v; } @@ -2061,6 +2135,11 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) { return false; }); + // Since we now have a complete set of input files, we can create + // a .d file to record build dependencies.
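// For example, linking a.out from foo.o and "dir/my lib.o" would produce a
// dependency file like the following (hypothetical paths; note the escaped
// space):
//
//   a.out: \
//     foo.o \
//     dir/my\ lib.o
//
//   foo.o:
//
//   dir/my\ lib.o: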
+ if (!config->dependencyFile.empty()) + writeDependencyFile(); + // Now that the number of partitions is fixed, save a pointer to the main // partition. mainPart = &partitions[0]; diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index c142c00517ccf..6199f43b466a2 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -110,6 +110,7 @@ Optional elf::readFile(StringRef path) { path = saver.save(config->chroot + path); log(path); + config->dependencyFiles.insert(llvm::CachedHashString(path)); auto mbOrErr = MemoryBuffer::getFile(path, -1, false); if (auto ec = mbOrErr.getError()) { diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index b8041afed6c96..ae77fadcc78d3 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -76,7 +76,7 @@ static lto::Config createConfig() { c.Options.DataSections = true; // Check if basic block sections must be used. - // Allowed values for --lto-basicblock-sections are "all", "labels", + // Allowed values for --lto-basic-block-sections are "all", "labels", // "", or none. This is the equivalent // of -fbasic-block-sections= flag in clang. if (!config->ltoBasicBlockSections.empty()) { diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index c3c1309aca1a7..c3cadafdccd28 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -132,6 +132,9 @@ defm demangle: B<"demangle", "Demangle symbol names (default)", "Do not demangle symbol names">; +defm dependency_file: EEq<"dependency-file", "Write a dependency file">, + MetaVarName<"">; + def disable_new_dtags: F<"disable-new-dtags">, HelpText<"Disable new dynamic tags">; @@ -545,9 +548,9 @@ def opt_remarks_with_hotness: FF<"opt-remarks-with-hotness">, def opt_remarks_format: Separate<["--"], "opt-remarks-format">, HelpText<"The format used for serializing remarks (default: YAML)">; def save_temps: F<"save-temps">; -def lto_basicblock_sections: JJ<"lto-basicblock-sections=">, +def lto_basic_block_sections: JJ<"lto-basic-block-sections=">, HelpText<"Enable basic block sections for LTO">; -defm lto_unique_bb_section_names: BB<"lto-unique-bb-section-names", +defm lto_unique_basic_block_section_names: BB<"lto-unique-basic-block-section-names", "Give unique names to every basic block section for LTO", "Do not give unique names to every basic block section for LTO (default)">; def shuffle_sections: JJ<"shuffle-sections=">, MetaVarName<"">, diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index fea6b7a274e77..eae1d17b2f43e 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/ScopedPrinter.h" #include @@ -1310,7 +1311,10 @@ Expr ScriptParser::readPrimary() { } if (tok == "DEFINED") { StringRef name = readParenLiteral(); - return [=] { return symtab->find(name) ? 1 : 0; }; + return [=] { + Symbol *b = symtab->find(name); + return (b && b->isDefined()) ? 1 : 0; + }; } if (tok == "LENGTH") { StringRef name = readParenLiteral(); @@ -1329,6 +1333,15 @@ Expr ScriptParser::readPrimary() { return cmd->getLMA(); }; } + if (tok == "LOG2CEIL") { + expect("("); + Expr a = readExpr(); + expect(")"); + return [=] { + // LOG2CEIL(0) is defined to be 0. 
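+ // For example: LOG2CEIL(1) = 0, LOG2CEIL(4) = 2, and LOG2CEIL(5) = 3 (values + // between powers of two round up); the std::max below clamps the argument to + // 1 so that LOG2CEIL(0) also yields 0.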
return llvm::Log2_64_Ceil(std::max(a().getValue(), UINT64_C(1))); + }; + } if (tok == "MAX" || tok == "MIN") { expect("("); Expr a = readExpr(); diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 091a291b26cd4..cbdb286557f8d 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -305,6 +305,21 @@ class PPC64R12SetupStub final : public Thunk { void addSymbols(ThunkSection &isec) override; }; +// PPC64 PC-relative PLT Stub +// When a caller that does not maintain a TOC pointer performs an extern call, +// this stub is needed for: +// 1) Loading the target function's address from the procedure linkage table +// into r12, for use by the target function's global entry point, and into +// the count register, using pc-relative instructions. +// 2) Transferring control to the target function through an indirect branch. +class PPC64PCRelPLTStub final : public Thunk { +public: + PPC64PCRelPLTStub(Symbol &dest) : Thunk(dest, 0) {} + uint32_t size() override { return 16; } + void writeTo(uint8_t *buf) override; + void addSymbols(ThunkSection &isec) override; +}; + // A bl instruction uses a signed 24 bit offset, with an implicit 4 byte // alignment. This gives a possible 26 bits of 'reach'. If the call offset is // larger than that we need to emit a long-branch thunk. The target address @@ -880,6 +895,23 @@ void PPC64R12SetupStub::addSymbols(ThunkSection &isec) { isec); } +void PPC64PCRelPLTStub::writeTo(uint8_t *buf) { + int64_t offset = destination.getGotPltVA() - getThunkTargetSym()->getVA(); + if (!isInt<34>(offset)) + fatal("offset must fit in 34 bits to encode in the instruction"); + uint64_t pld = + PLD_R12_NO_DISP | (((offset >> 16) & 0x3ffff) << 32) | (offset & 0xffff); + + writePrefixedInstruction(buf + 0, pld); // pld r12, func@plt@pcrel + write32(buf + 8, MTCTR_R12); // mtctr r12 + write32(buf + 12, BCTR); // bctr +} + +void PPC64PCRelPLTStub::addSymbols(ThunkSection &isec) { + addSymbol(saver.save("__plt_pcrel_" + destination.getName()), STT_FUNC, 0, + isec); +} + void PPC64LongBranchThunk::writeTo(uint8_t *buf) { int64_t offset = in.ppc64LongBranchTarget->getEntryVA(&destination, addend) - getPPC64TocBase(); @@ -1007,7 +1039,8 @@ static Thunk *addThunkPPC64(RelType type, Symbol &s, int64_t a) { type == R_PPC64_REL24_NOTOC) && "unexpected relocation type for thunk"); if (s.isInPlt()) - return make<PPC64PltCallStub>(s); + return type == R_PPC64_REL24_NOTOC ? (Thunk *)make<PPC64PCRelPLTStub>(s) + : (Thunk *)make<PPC64PltCallStub>(s); // This check looks at the st_other bits of the callee. If the value is 1 // then the callee clobbers the TOC and we need an R2 save stub.
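Concretely, PPC64PCRelPLTStub::writeTo above emits a fixed 16-byte sequence (the prefixed load alone occupies 8 bytes), assuming the 34-bit offset check passes; a sketch of the resulting instructions:

  pld   r12, func@plt@pcrel  # 8-byte prefixed, PC-relative load of the PLT entry
  mtctr r12                  # move the target address into the count register
  bctr                       # indirect branch through the count register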
diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index 36f686ca2f1d8..458dad805b4a2 100644 --- a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -218,6 +218,7 @@ void X86_64::prepareSymbolRelocation(lld::macho::Symbol &sym, in.got->addEntry(sym); break; case X86_64_RELOC_BRANCH: { + // TODO: weak dysyms should go into the weak binding section instead if (auto *dysym = dyn_cast(&sym)) in.stubs->addEntry(*dysym); break; diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 79812a4335638..f65ff7c5c1047 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -25,12 +25,12 @@ struct SymbolPriorityEntry; struct Configuration { Symbol *entry; bool hasReexports = false; + uint32_t headerPad; llvm::StringRef installName; llvm::StringRef outputFile; llvm::MachO::Architecture arch; llvm::MachO::HeaderFileType outputType; std::vector librarySearchPaths; - // TODO: use the framework search paths std::vector frameworkSearchPaths; llvm::DenseMap priorities; }; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 4dfb387e4e622..6b25cd55ccfc7 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -13,6 +13,7 @@ #include "OutputSegment.h" #include "SymbolTable.h" #include "Symbols.h" +#include "SyntheticSections.h" #include "Target.h" #include "Writer.h" @@ -30,6 +31,7 @@ #include "llvm/Object/Archive.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -98,6 +100,32 @@ static Optional findLibrary(StringRef name) { return {}; } +static Optional findFramework(StringRef name) { + // TODO: support .tbd files + llvm::SmallString<260> symlink; + llvm::SmallString<260> location; + StringRef suffix; + std::tie(name, suffix) = name.split(","); + for (StringRef dir : config->frameworkSearchPaths) { + symlink = dir; + path::append(symlink, name + ".framework", name); + // If the symlink fails to resolve, skip to the next search path. + // NOTE: we must resolve the symlink before trying the suffixes, because + // there are no symlinks for the suffixed paths. + if (fs::real_path(symlink, location)) + continue; + if (!suffix.empty()) { + llvm::Twine suffixed = location + suffix; + if (fs::exists(suffixed)) + return suffixed.str(); + // Suffix lookup failed, fall through to the no-suffix case. + } + if (fs::exists(location)) + return location.str().str(); + } + return {}; +} + static TargetInfo *createTargetInfo(opt::InputArgList &args) { StringRef arch = args.getLastArgValue(OPT_arch, "x86_64"); config->arch = llvm::MachO::getArchitectureFromName( @@ -186,6 +214,15 @@ static void addFile(StringRef path) { } } +static void addFileList(StringRef path) { + Optional buffer = readFile(path); + if (!buffer) + return; + MemoryBufferRef mbref = *buffer; + for (StringRef path : args::getLines(mbref)) + addFile(path); +} + static std::array archNames{"arm", "arm64", "i386", "x86_64", "ppc", "ppc64"}; static bool isArchString(StringRef s) { @@ -359,6 +396,7 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, config->outputFile = args.getLastArgValue(OPT_o, "a.out"); config->installName = args.getLastArgValue(OPT_install_name, config->outputFile); + config->headerPad = args::getHex(args, OPT_headerpad, /*Default=*/32); getLibrarySearchPaths(config->librarySearchPaths, args); getFrameworkSearchPaths(config->frameworkSearchPaths, args); config->outputType = args.hasArg(OPT_dylib) ? 
MH_DYLIB : MH_EXECUTE; @@ -384,6 +422,9 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, case OPT_INPUT: addFile(arg->getValue()); break; + case OPT_filelist: + addFileList(arg->getValue()); + break; case OPT_l: { StringRef name = arg->getValue(); if (Optional path = findLibrary(name)) { @@ -393,13 +434,25 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, error("library not found for -l" + name); break; } + case OPT_framework: { + StringRef name = arg->getValue(); + if (Optional path = findFramework(name)) { + addFile(*path); + break; + } + error("framework not found for -framework " + name); + break; + } case OPT_platform_version: handlePlatformVersion(arg); break; case OPT_o: case OPT_dylib: case OPT_e: + case OPT_F: case OPT_L: + case OPT_headerpad: + case OPT_install_name: case OPT_Z: case OPT_arch: // handled elsewhere @@ -429,6 +482,7 @@ bool macho::link(llvm::ArrayRef argsArr, bool canExitEarly, } createSyntheticSections(); + symtab->addDSOHandle(in.header); // Initialize InputSections. for (InputFile *file : inputFiles) { diff --git a/lld/MachO/ExportTrie.cpp b/lld/MachO/ExportTrie.cpp index 7cc81bcfd5f18..993a552435325 100644 --- a/lld/MachO/ExportTrie.cpp +++ b/lld/MachO/ExportTrie.cpp @@ -59,6 +59,10 @@ struct Edge { struct ExportInfo { uint64_t address; + uint8_t flags; + explicit ExportInfo(const Symbol &sym) + : address(sym.getVA()), + flags(sym.isWeakDef() ? EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION : 0) {} // TODO: Add proper support for re-exports & stub-and-resolver flags. }; @@ -83,9 +87,8 @@ bool TrieNode::updateOffset(size_t &nextOffset) { // node. size_t nodeSize; if (info) { - uint64_t flags = 0; uint32_t terminalSize = - getULEB128Size(flags) + getULEB128Size(info->address); + getULEB128Size(info->flags) + getULEB128Size(info->address); // Overall node size so far is the uleb128 size of the length of the symbol // info + the symbol info itself. nodeSize = terminalSize + getULEB128Size(terminalSize); @@ -110,11 +113,10 @@ void TrieNode::writeTo(uint8_t *buf) const { buf += offset; if (info) { // TrieNodes with Symbol info: size, flags address - uint64_t flags = 0; // TODO: emit proper flags uint32_t terminalSize = - getULEB128Size(flags) + getULEB128Size(info->address); + getULEB128Size(info->flags) + getULEB128Size(info->address); buf += encodeULEB128(terminalSize, buf); - buf += encodeULEB128(flags, buf); + buf += encodeULEB128(info->flags, buf); buf += encodeULEB128(info->address, buf); } else { // TrieNode with no Symbol info. 
@@ -194,7 +196,7 @@ void TrieBuilder::sortAndBuild(MutableArrayRef vec, if (isTerminal) { assert(j - i == 1); // no duplicate symbols - node->info = {pivotSymbol->getVA()}; + node->info = ExportInfo(*pivotSymbol); } else { // This is the tail-call-optimized version of the following: // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 46fe82f988222..f1afc187aca23 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -228,10 +228,9 @@ void InputFile::parseSymbols(ArrayRef nList, StringRef name = strtab + sym.n_strx; if (sym.n_type & N_EXT) // Global defined symbol - return symtab->addDefined(name, isec, value); - else - // Local defined symbol - return make(name, isec, value); + return symtab->addDefined(name, isec, value, sym.n_desc & N_WEAK_DEF); + // Local defined symbol + return make(name, isec, value, sym.n_desc & N_WEAK_DEF); }; for (size_t i = 0, n = nList.size(); i < n; ++i) { @@ -351,7 +350,9 @@ DylibFile::DylibFile(MemoryBufferRef mb, DylibFile *umbrella) auto *c = reinterpret_cast(cmd); parseTrie(buf + c->export_off, c->export_size, [&](const Twine &name, uint64_t flags) { - symbols.push_back(symtab->addDylib(saver.save(name), umbrella)); + bool isWeakDef = flags & EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; + symbols.push_back( + symtab->addDylib(saver.save(name), umbrella, isWeakDef)); }); } else { error("LC_DYLD_INFO_ONLY not found in " + getName()); @@ -390,10 +391,11 @@ DylibFile::DylibFile(std::shared_ptr interface, dylibName = saver.save(interface->getInstallName()); // TODO(compnerd) filter out symbols based on the target platform + // TODO: handle weak defs for (const auto symbol : interface->symbols()) if (symbol->getArchitectures().has(config->arch)) - symbols.push_back( - symtab->addDylib(saver.save(symbol->getName()), umbrella)); + symbols.push_back(symtab->addDylib(saver.save(symbol->getName()), + umbrella, /*isWeakDef=*/false)); // TODO(compnerd) properly represent the hierarchy of the documents as it is // in theory possible to have re-exported dylibs from re-exported dylibs which // should be parent'ed to the child. 
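Since terminal trie nodes now serialize real flags, the payload layout is: ULEB128 terminal size, then ULEB128 flags, then ULEB128 address. A self-contained sketch of that encoding, mirroring (not reproducing) TrieNode::writeTo above; the function name is illustrative:

  #include "llvm/Support/LEB128.h"
  #include <cstddef>
  #include <cstdint>

  // Encodes one terminal payload; flags would carry
  // EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION (0x4) for a weak definition.
  static size_t writeTerminalPayload(uint8_t *buf, uint64_t flags,
                                     uint64_t address) {
    uint8_t *p = buf;
    uint32_t terminalSize =
        llvm::getULEB128Size(flags) + llvm::getULEB128Size(address);
    p += llvm::encodeULEB128(terminalSize, p);
    p += llvm::encodeULEB128(flags, p);
    p += llvm::encodeULEB128(address, p);
    return p - buf;
  }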
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 1e42542b9ac47..c27ca0d0195f9 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -456,6 +456,11 @@ def map : Separate<["-"], "map">, HelpText<"Writes all symbols and their addresses to ">, Flags<[HelpHidden]>, Group; +def dependency_info : Separate<["-"], "dependency_info">, + MetaVarName<"">, + HelpText<"Dump dependency info">, + Flags<[HelpHidden]>, + Group; def grp_symtab : OptionGroup<"symtab">, HelpText<"SYMBOL TABLE OPTIMIZATIONS">; @@ -688,7 +693,6 @@ def umbrella : Separate<["-"], "umbrella">, def headerpad : Separate<["-"], "headerpad">, MetaVarName<"">, HelpText<"Allocate hex extra space for future expansion of the load commands via install_name_tool">, - Flags<[HelpHidden]>, Group; def headerpad_max_install_names : Flag<["-"], "headerpad_max_install_names">, HelpText<"Allocate extra space so all load-command paths can expand to MAXPATHLEN via install_name_tool">, @@ -1098,10 +1102,6 @@ def demangle : Flag<["-"], "demangle">, HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, Group; -def dependency_info : Flag<["-"], "dependency_info">, - HelpText<"This option is undocumented in ld64">, - Flags<[HelpHidden]>, - Group; def dyld_env : Flag<["-"], "dyld_env">, HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp index 80e870d79890c..1a8a1d5ac0650 100644 --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -37,15 +37,23 @@ std::pair SymbolTable::insert(StringRef name) { } Symbol *SymbolTable::addDefined(StringRef name, InputSection *isec, - uint32_t value) { + uint32_t value, bool isWeakDef) { Symbol *s; bool wasInserted; std::tie(s, wasInserted) = insert(name); - if (!wasInserted && isa(s)) - error("duplicate symbol: " + name); - - replaceSymbol(s, name, isec, value); + if (!wasInserted) { + if (auto *defined = dyn_cast(s)) { + if (isWeakDef) + return s; + if (!defined->isWeakDef()) + error("duplicate symbol: " + name); + } + // Defined symbols take priority over other types of symbols, so in case + // of a name conflict, we fall through to the replaceSymbol() call below. 
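+ // Spelled out, the outcomes of this policy are: + // * new weak Defined vs. any existing Defined -> keep the existing symbol; + // * new strong Defined vs. existing strong Defined -> duplicate-symbol error; + // * new strong Defined vs. existing weak Defined -> fall through, new wins; + // * existing Undefined/DylibSymbol/LazySymbol -> fall through, Defined wins.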
+ } + + replaceSymbol<Defined>(s, name, isec, value, isWeakDef); + return s; } @@ -61,13 +69,15 @@ Symbol *SymbolTable::addUndefined(StringRef name) { return s; } -Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file) { +Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file, bool isWeakDef) { Symbol *s; bool wasInserted; std::tie(s, wasInserted) = insert(name); - if (wasInserted || isa<Undefined>(s)) - replaceSymbol<DylibSymbol>(s, file, name); + if (wasInserted || isa<Undefined>(s) || + (isa<DylibSymbol>(s) && !isWeakDef && s->isWeakDef())) + replaceSymbol<DylibSymbol>(s, file, name, isWeakDef); + return s; } @@ -79,9 +89,22 @@ Symbol *SymbolTable::addLazy(StringRef name, ArchiveFile *file, if (wasInserted) replaceSymbol<LazySymbol>(s, file, sym); - else if (isa<Undefined>(s)) + else if (isa<Undefined>(s) || (isa<DylibSymbol>(s) && s->isWeakDef())) file->fetch(sym); return s; } +Symbol *SymbolTable::addDSOHandle(const MachHeaderSection *header) { + Symbol *s; + bool wasInserted; + std::tie(s, wasInserted) = insert(DSOHandle::name); + if (!wasInserted) { + if (auto *defined = dyn_cast<Defined>(s)) + error("found defined symbol from " + defined->isec->file->getName() + + " with illegal name " + DSOHandle::name); + } + replaceSymbol<DSOHandle>(s, header); + return s; +} + SymbolTable *macho::symtab; diff --git a/lld/MachO/SymbolTable.h b/lld/MachO/SymbolTable.h index 2379008db56da..822eb5b35dac5 100644 --- a/lld/MachO/SymbolTable.h +++ b/lld/MachO/SymbolTable.h @@ -20,19 +20,29 @@ namespace macho { class ArchiveFile; class DylibFile; class InputSection; +class MachHeaderSection; class Symbol; +/* + * Note that the SymbolTable handles name collisions by calling + * replaceSymbol(), which does an in-place update of the Symbol via `placement + * new`. Therefore, there is no need to update any relocations that hold + * pointers to the "old" Symbol -- they will automatically point to the new one.
+ */ class SymbolTable { public: - Symbol *addDefined(StringRef name, InputSection *isec, uint32_t value); + Symbol *addDefined(StringRef name, InputSection *isec, uint32_t value, + bool isWeakDef); Symbol *addUndefined(StringRef name); - Symbol *addDylib(StringRef name, DylibFile *file); + Symbol *addDylib(StringRef name, DylibFile *file, bool isWeakDef); Symbol *addLazy(StringRef name, ArchiveFile *file, const llvm::object::Archive::Symbol &sym); + Symbol *addDSOHandle(const MachHeaderSection *); + ArrayRef getSymbols() const { return symVector; } Symbol *find(StringRef name); diff --git a/lld/MachO/Symbols.cpp b/lld/MachO/Symbols.cpp index fbafa8a92a4fa..af5d9d25029f5 100644 --- a/lld/MachO/Symbols.cpp +++ b/lld/MachO/Symbols.cpp @@ -8,6 +8,7 @@ #include "Symbols.h" #include "InputFiles.h" +#include "SyntheticSections.h" using namespace llvm; using namespace lld; @@ -21,3 +22,9 @@ std::string lld::toString(const Symbol &sym) { return *s; return std::string(sym.getName()); } + +uint64_t DSOHandle::getVA() const { return header->addr; } + +uint64_t DSOHandle::getFileOffset() const { return header->fileOff; } + +constexpr StringRef DSOHandle::name; diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index 63748ee483245..1e0767a6a12b9 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -19,6 +19,7 @@ namespace lld { namespace macho { class InputSection; +class MachHeaderSection; class DylibFile; class ArchiveFile; @@ -37,15 +38,22 @@ class Symbol { UndefinedKind, DylibKind, LazyKind, + DSOHandleKind, }; + virtual ~Symbol() {} + Kind kind() const { return static_cast(symbolKind); } StringRef getName() const { return {name.data, name.size}; } - uint64_t getVA() const; + virtual uint64_t getVA() const { return 0; } + + virtual uint64_t getFileOffset() const { + llvm_unreachable("attempt to get an offset from a non-defined symbol"); + } - uint64_t getFileOffset() const; + virtual bool isWeakDef() const { llvm_unreachable("cannot be weak"); } uint32_t gotIndex = UINT32_MAX; @@ -58,13 +66,25 @@ class Symbol { class Defined : public Symbol { public: - Defined(StringRefZ name, InputSection *isec, uint32_t value) - : Symbol(DefinedKind, name), isec(isec), value(value) {} + Defined(StringRefZ name, InputSection *isec, uint32_t value, bool isWeakDef) + : Symbol(DefinedKind, name), isec(isec), value(value), + weakDef(isWeakDef) {} + + bool isWeakDef() const override { return weakDef; } + + static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } + + uint64_t getVA() const override { return isec->getVA() + value; } + + uint64_t getFileOffset() const override { + return isec->getFileOffset() + value; + } InputSection *isec; uint32_t value; - static bool classof(const Symbol *s) { return s->kind() == DefinedKind; } +private: + const bool weakDef; }; class Undefined : public Symbol { @@ -76,14 +96,19 @@ class Undefined : public Symbol { class DylibSymbol : public Symbol { public: - DylibSymbol(DylibFile *file, StringRefZ name) - : Symbol(DylibKind, name), file(file) {} + DylibSymbol(DylibFile *file, StringRefZ name, bool isWeakDef) + : Symbol(DylibKind, name), file(file), weakDef(isWeakDef) {} + + bool isWeakDef() const override { return weakDef; } static bool classof(const Symbol *s) { return s->kind() == DylibKind; } DylibFile *file; uint32_t stubsIndex = UINT32_MAX; uint32_t lazyBindOffset = UINT32_MAX; + +private: + const bool weakDef; }; class LazySymbol : public Symbol { @@ -100,17 +125,32 @@ class LazySymbol : public Symbol { const llvm::object::Archive::Symbol sym; }; 
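// For context, compiler-generated registration of a static destructor is the
// typical consumer of this symbol; a hypothetical sketch (not from this
// patch):
//
//   extern "C" int __cxa_atexit(void (*fn)(void *), void *arg, void *dso);
//   extern "C" void *__dso_handle;
//   // emitted at static-init time: __cxa_atexit(&dtor, &obj, &__dso_handle);
//
// which is why the linker must synthesize ___dso_handle itself when no input
// defines it.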
-inline uint64_t Symbol::getVA() const { - if (auto *d = dyn_cast<Defined>(this)) - return d->isec->getVA() + d->value; - return 0; -} +// The Itanium C++ ABI requires dylibs to pass a pointer to __cxa_atexit which +// does e.g. cleanup of static global variables. The ABI document says that the +// pointer can point to any address in one of the dylib's segments, but in +// practice ld64 seems to set it to point to the header, so that's what's +// implemented here. +// +// The ARM C++ ABI uses __dso_handle similarly, but I (int3) have not yet +// tested this on an ARM platform. +// +// DSOHandle effectively functions like a Defined symbol, but it doesn't belong +// to an InputSection. +class DSOHandle : public Symbol { +public: + DSOHandle(const MachHeaderSection *header) : Symbol(DSOHandleKind, name), header(header) {} -inline uint64_t Symbol::getFileOffset() const { - if (auto *d = dyn_cast<Defined>(this)) - return d->isec->getFileOffset() + d->value; - llvm_unreachable("attempt to get an offset from an undefined symbol"); -} + const MachHeaderSection *header; + + uint64_t getVA() const override; + + uint64_t getFileOffset() const override; + + static constexpr StringRef name = "___dso_handle"; + + static bool classof(const Symbol *s) { return s->kind() == DSOHandleKind; } +}; union SymbolUnion { alignas(Defined) char a[sizeof(Defined)]; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index cc0d5a93c40de..ac772da0129f8 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -47,7 +47,7 @@ void MachHeaderSection::addLoadCommand(LoadCommand *lc) { } uint64_t MachHeaderSection::getSize() const { - return sizeof(MachO::mach_header_64) + sizeOfCmds; + return sizeof(MachO::mach_header_64) + sizeOfCmds + config->headerPad; } void MachHeaderSection::writeTo(uint8_t *buf) const { @@ -94,7 +94,7 @@ void GotSection::writeTo(uint8_t *buf) const { } BindingSection::BindingSection() - : SyntheticSection(segment_names::linkEdit, section_names::binding) {} + : LinkEditSection(segment_names::linkEdit, section_names::binding) {} bool BindingSection::isNeeded() const { return bindings.size() != 0 || in.got->isNeeded(); @@ -264,7 +264,8 @@ void StubHelperSection::setup() { in.got->addEntry(*stubBinder); inputSections.push_back(in.imageLoaderCache); - symtab->addDefined("__dyld_private", in.imageLoaderCache, 0); + symtab->addDefined("__dyld_private", in.imageLoaderCache, 0, + /*isWeakDef=*/false); } ImageLoaderCacheSection::ImageLoaderCacheSection() { @@ -300,7 +301,7 @@ void LazyPointerSection::writeTo(uint8_t *buf) const { } LazyBindingSection::LazyBindingSection() - : SyntheticSection(segment_names::linkEdit, section_names::lazyBinding) {} + : LinkEditSection(segment_names::linkEdit, section_names::lazyBinding) {} bool LazyBindingSection::isNeeded() const { return in.stubs->isNeeded(); } @@ -343,7 +344,7 @@ uint32_t LazyBindingSection::encode(const DylibSymbol &sym) { } ExportSection::ExportSection() - : SyntheticSection(segment_names::linkEdit, section_names::export_) {} + : LinkEditSection(segment_names::linkEdit, section_names::export_) {} void ExportSection::finalizeContents() { // TODO: We should check symbol visibility.
@@ -357,11 +358,7 @@ void ExportSection::writeTo(uint8_t *buf) const { trieBuilder.writeTo(buf); } SymtabSection::SymtabSection(StringTableSection &stringTableSection) : SyntheticSection(segment_names::linkEdit, section_names::symbolTable), - stringTableSection(stringTableSection) { - // TODO: When we introduce the SyntheticSections superclass, we should make - // all synthetic sections aligned to WordSize by default. - align = WordSize; -} + stringTableSection(stringTableSection) {} uint64_t SymtabSection::getSize() const { return symbols.size() * sizeof(structs::nlist_64); @@ -391,7 +388,7 @@ void SymtabSection::writeTo(uint8_t *buf) const { } StringTableSection::StringTableSection() - : SyntheticSection(segment_names::linkEdit, section_names::stringTable) {} + : LinkEditSection(segment_names::linkEdit, section_names::stringTable) {} uint32_t StringTableSection::addString(StringRef str) { uint32_t strx = size; diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index a8fbf6c8a2657..db2ffbe57d442 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -49,6 +49,27 @@ class SyntheticSection : public OutputSection { const StringRef segname; }; +// All sections in __LINKEDIT should inherit from this. +class LinkEditSection : public SyntheticSection { +public: + LinkEditSection(const char *segname, const char *name) + : SyntheticSection(segname, name) { + align = WordSize; + } + + virtual uint64_t getRawSize() const = 0; + + // codesign (or more specifically libstuff) checks that each section in + // __LINKEDIT ends where the next one starts -- no gaps are permitted. We + // therefore align every section's start and end points to WordSize. + // + // NOTE: This assumes that the extra bytes required for alignment can be + // zero-valued bytes. + uint64_t getSize() const override final { + return llvm::alignTo(getRawSize(), WordSize); + } +}; + // The header of the Mach-O file, which must have a file offset of zero. class MachHeaderSection : public SyntheticSection { public: @@ -105,11 +126,11 @@ struct BindingEntry { }; // Stores bind opcodes for telling dyld which symbols to load non-lazily. -class BindingSection : public SyntheticSection { +class BindingSection : public LinkEditSection { public: BindingSection(); void finalizeContents(); - uint64_t getSize() const override { return contents.size(); } + uint64_t getRawSize() const override { return contents.size(); } // Like other sections in __LINKEDIT, the binding section is special: its // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in // section headers. @@ -194,11 +215,11 @@ class LazyPointerSection : public SyntheticSection { void writeTo(uint8_t *buf) const override; }; -class LazyBindingSection : public SyntheticSection { +class LazyBindingSection : public LinkEditSection { public: LazyBindingSection(); void finalizeContents(); - uint64_t getSize() const override { return contents.size(); } + uint64_t getRawSize() const override { return contents.size(); } uint32_t encode(const DylibSymbol &); // Like other sections in __LINKEDIT, the lazy binding section is special: its // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in @@ -213,11 +234,11 @@ class LazyBindingSection : public SyntheticSection { }; // Stores a trie that describes the set of exported symbols. 
-class ExportSection : public SyntheticSection { +class ExportSection : public LinkEditSection { public: ExportSection(); void finalizeContents(); - uint64_t getSize() const override { return size; } + uint64_t getRawSize() const override { return size; } // Like other sections in __LINKEDIT, the export section is special: its // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in // section headers. @@ -230,12 +251,12 @@ class ExportSection : public SyntheticSection { }; // Stores the strings referenced by the symbol table. -class StringTableSection : public SyntheticSection { +class StringTableSection : public LinkEditSection { public: StringTableSection(); // Returns the start offset of the added string. uint32_t addString(StringRef); - uint64_t getSize() const override { return size; } + uint64_t getRawSize() const override { return size; } // Like other sections in __LINKEDIT, the string table section is special: its // offsets are recorded in the LC_SYMTAB load command, instead of in section // headers. @@ -273,6 +294,7 @@ class SymtabSection : public SyntheticSection { }; struct InStruct { + MachHeaderSection *header = nullptr; BindingSection *binding = nullptr; GotSection *got = nullptr; LazyPointerSection *lazyPointers = nullptr; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 03000a7f437e0..886ca482136b7 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -53,7 +53,7 @@ class Writer { std::unique_ptr<FileOutputBuffer> &buffer; uint64_t addr = 0; uint64_t fileOff = 0; - MachHeaderSection *headerSection = nullptr; + MachHeaderSection *header = nullptr; LazyBindingSection *lazyBindingSection = nullptr; ExportSection *exportSection = nullptr; StringTableSection *stringTableSection = nullptr; @@ -134,7 +134,11 @@ class LCSegment : public LoadCommand { c->nsects = seg->numNonHiddenSections(); for (OutputSection *osec : seg->getSections()) { - c->filesize += osec->getFileSize(); + if (!isZeroFill(osec->flags)) { + assert(osec->fileOff >= seg->fileOff); + c->filesize = std::max( + c->filesize, osec->fileOff + osec->getFileSize() - seg->fileOff); + } if (osec->isHidden()) continue; @@ -260,20 +264,18 @@ void Writer::scanRelocations() { } void Writer::createLoadCommands() { - headerSection->addLoadCommand( + in.header->addLoadCommand( make<LCDyldInfo>(in.binding, lazyBindingSection, exportSection)); - headerSection->addLoadCommand( - make<LCSymtab>(symtabSection, stringTableSection)); - headerSection->addLoadCommand(make<LCDysymtab>()); + in.header->addLoadCommand(make<LCSymtab>(symtabSection, stringTableSection)); + in.header->addLoadCommand(make<LCDysymtab>()); switch (config->outputType) { case MH_EXECUTE: - headerSection->addLoadCommand(make<LCMain>()); - headerSection->addLoadCommand(make<LCLoadDylinker>()); + in.header->addLoadCommand(make<LCMain>()); + in.header->addLoadCommand(make<LCLoadDylinker>()); break; case MH_DYLIB: - headerSection->addLoadCommand( - make<LCDylib>(LC_ID_DYLIB, config->installName)); + in.header->addLoadCommand(make<LCDylib>(LC_ID_DYLIB, config->installName)); break; default: llvm_unreachable("unhandled output file type"); @@ -281,19 +283,19 @@ void Writer::createLoadCommands() { uint8_t segIndex = 0; for (OutputSegment *seg : outputSegments) { - headerSection->addLoadCommand(make<LCSegment>(seg->name, seg)); + in.header->addLoadCommand(make<LCSegment>(seg->name, seg)); seg->index = segIndex++; } uint64_t dylibOrdinal = 1; for (InputFile *file : inputFiles) { if (auto *dylibFile = dyn_cast<DylibFile>(file)) { - headerSection->addLoadCommand( + in.header->addLoadCommand( make<LCDylib>(LC_LOAD_DYLIB, dylibFile->dylibName)); dylibFile->ordinal = dylibOrdinal++; if (dylibFile->reexport) -
headerSection->addLoadCommand( + in.header->addLoadCommand( make<LCDylib>(LC_REEXPORT_DYLIB, dylibFile->dylibName)); } } @@ -351,7 +353,8 @@ static int sectionOrder(OutputSection *osec) { return -1; } else if (segname == segment_names::linkEdit) { return StringSwitch<int>(osec->name) - .Case(section_names::binding, -4) + .Case(section_names::binding, -5) + .Case(section_names::lazyBinding, -4) .Case(section_names::export_, -3) .Case(section_names::symbolTable, -2) .Case(section_names::stringTable, -1) @@ -402,7 +405,6 @@ static void sortSegmentsAndSections() { void Writer::createOutputSections() { // First, create hidden sections - headerSection = make<MachHeaderSection>(); lazyBindingSection = make<LazyBindingSection>(); stringTableSection = make<StringTableSection>(); symtabSection = make<SymtabSection>(*stringTableSection); @@ -454,6 +456,8 @@ void Writer::assignAddresses(OutputSegment *seg) { seg->fileOff = fileOff; for (auto *osec : seg->getSections()) { + if (!osec->isNeeded()) + continue; addr = alignTo(addr, osec->align); fileOff = alignTo(fileOff, osec->align); osec->addr = addr; @@ -533,6 +537,7 @@ void Writer::run() { void macho::writeResult() { Writer().run(); } void macho::createSyntheticSections() { + in.header = make<MachHeaderSection>(); in.binding = make<BindingSection>(); in.got = make<GotSection>(); in.lazyPointers = make<LazyPointerSection>(); diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index f33b5e19502c6..d60765c70c095 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -288,6 +288,8 @@ bool mingw::link(ArrayRef<const char *> argsArr, bool canExitEarly, add("-kill-at"); if (args.hasArg(OPT_appcontainer)) add("-appcontainer"); + if (args.hasArg(OPT_no_seh)) + add("-noseh"); if (args.getLastArgValue(OPT_m) != "thumb2pe" && args.getLastArgValue(OPT_m) != "arm64pe" && !args.hasArg(OPT_dynamicbase)) diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index 3281951dc89dd..fe44166600509 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -56,6 +56,7 @@ defm minor_subsystem_version: EqLong<"minor-subsystem-version", "Set the OS and subsystem minor version">; def no_insert_timestamp: F<"no-insert-timestamp">, HelpText<"Don't include PE header timestamp">; +def no_seh: F<"no-seh">, HelpText<"Set the 'no SEH' flag in the executable">; def no_whole_archive: F<"no-whole-archive">, HelpText<"No longer include all object files for following archives">; def large_address_aware: Flag<["--"], "large-address-aware">, @@ -111,7 +112,6 @@ def: Flag<["--"], "full-shutdown">; def: F<"high-entropy-va">; def: S<"major-image-version">; def: S<"minor-image-version">; -def: F<"no-seh">; def: F<"nxcompat">; def: F<"pic-executable">; def: S<"plugin">; diff --git a/lld/include/lld/Common/Args.h b/lld/include/lld/Common/Args.h index b3c8686e57e22..48f7b4079cd97 100644 --- a/lld/include/lld/Common/Args.h +++ b/lld/include/lld/Common/Args.h @@ -28,6 +28,8 @@ llvm::CodeGenOpt::Level getCGOptLevel(int optLevelLTO); int64_t getInteger(llvm::opt::InputArgList &args, unsigned key, int64_t Default); +int64_t getHex(llvm::opt::InputArgList &args, unsigned key, int64_t Default); + std::vector<StringRef> getStrings(llvm::opt::InputArgList &args, int id); uint64_t getZOptionValue(llvm::opt::InputArgList &args, int id, StringRef key, diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt index 753eacf4d255b..52e6118ba876b 100644 --- a/lld/test/CMakeLists.txt +++ b/lld/test/CMakeLists.txt @@ -25,10 +25,10 @@ configure_lit_site_cfg( set(LLD_TEST_DEPS lld) if (NOT LLD_BUILT_STANDALONE) list(APPEND LLD_TEST_DEPS - FileCheck count extract llc llvm-ar llvm-as llvm-bcanalyzer llvm-config llvm-cvtres + FileCheck count llc llvm-ar llvm-as
llvm-bcanalyzer llvm-config llvm-cvtres llvm-dis llvm-dwarfdump llvm-lib llvm-lipo llvm-mc llvm-nm llvm-objcopy llvm-objdump llvm-pdbutil llvm-readelf llvm-readobj llvm-strip not obj2yaml - opt yaml2obj + opt split-file yaml2obj ) endif() diff --git a/lld/test/COFF/associative-comdat-mingw-i386.s b/lld/test/COFF/associative-comdat-mingw-i386.s index 3f5e02330d505..3ba8c1cd9a75b 100644 --- a/lld/test/COFF/associative-comdat-mingw-i386.s +++ b/lld/test/COFF/associative-comdat-mingw-i386.s @@ -1,10 +1,14 @@ # REQUIRES: x86 -# RUN: llvm-mc -triple=i686-windows-gnu %s -filetype=obj -o %t.obj +# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=0 -filetype=obj -o %t.obj # RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe # RUN: llvm-objdump -s %t.exe | FileCheck %s +# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=1 -filetype=obj -o %t.stdcall.obj +# RUN: lld-link -lldmingw -entry:main %t.stdcall.obj -out:%t.stdcall.exe +# RUN: llvm-objdump -s %t.stdcall.exe | FileCheck %s + # Check that the .eh_frame comdat was included, even if it had no symbols, # due to associativity with the symbol _foo. @@ -19,19 +23,34 @@ .globl _main .p2align 4, 0x90 _main: +.if stdcall==0 call _foo +.else + call _foo@0 +.endif ret .section .eh_frame$foo,"dr" .linkonce discard .byte 0x42 +.if stdcall==0 .def _foo; +.else + .def _foo@0; +.endif .scl 2; .type 32; .endef - .section .text$foo,"xr",discard,foo +.if stdcall==0 + .section .text$foo,"xr",discard,_foo .globl _foo .p2align 4 _foo: +.else + .section .text$foo,"xr",discard,_foo@0 + .globl _foo@0 + .p2align 4 +_foo@0: +.endif ret diff --git a/lld/test/COFF/associative-comdat-mingw-weak.s b/lld/test/COFF/associative-comdat-mingw-weak.s new file mode 100644 index 0000000000000..80c738b436be4 --- /dev/null +++ b/lld/test/COFF/associative-comdat-mingw-weak.s @@ -0,0 +1,63 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-windows-gnu %s -filetype=obj -o %t.obj +# RUN: llvm-readobj --symbols %t.obj | FileCheck %s --check-prefix=SYMBOL + +# RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe -lldmap:%t.map -verbose +# RUN: llvm-readobj --sections %t.exe | FileCheck %s + +# CHECK: Sections [ +# CHECK: Section { +# CHECK: Number: 2 +# CHECK-LABEL: Name: .rdata (2E 72 64 61 74 61 00 00) +# This is the critical check to show that .xdata$foo was +# retained, while .xdata$bar wasn't. This *must* be 0x24 +# (0x4 for the .xdata section and 0x20 for the +# .ctors/.dtors headers/ends). +# CHECK-NEXT: VirtualSize: 0x24 + +# Check that the weak symbols are still emitted as they were when the test was +# written, to make sure the test still actually tests what was intended. + +# SYMBOL: Symbol { +# SYMBOL: Name: foo +# SYMBOL-NEXT: Value: 0 +# SYMBOL-NEXT: Section: IMAGE_SYM_UNDEFINED (0) +# SYMBOL-NEXT: BaseType: Null (0x0) +# SYMBOL-NEXT: ComplexType: Null (0x0) +# SYMBOL-NEXT: StorageClass: WeakExternal (0x69) +# SYMBOL-NEXT: AuxSymbolCount: 1 +# SYMBOL-NEXT: AuxWeakExternal { +# SYMBOL-NEXT: Linked: .weak.foo.default.main (19) +# SYMBOL-NEXT: Search: Alias (0x3) +# SYMBOL-NEXT: } +# SYMBOL-NEXT: } + + .text + .globl main +main: + call foo + retq + +# See associative-comdat-mingw.s for the general setup. Here, the leader +# symbols are weak, which causes the functions foo and bar to be undefined +# weak externals, while the actual leader symbols are named like +# .weak.foo.default.main.
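The comment above leans on how GNU-style COFF encodes a weak definition: the visible symbol becomes an undefined WeakExternal whose aux record names a hidden real definition such as .weak.foo.default.main. A rough model of how a linker chases that aux link, under that assumption (illustrative types, not lld's actual COFF resolver):

    #include <string>
    #include <unordered_map>

    // Toy model of COFF weak-external resolution; types are illustrative.
    struct CoffSymbol {
      bool defined = false;
      bool weakExternal = false; // storage class IMAGE_SYM_CLASS_WEAK_EXTERNAL
      std::string auxDefault;    // aux record target, e.g. ".weak.foo.default.main"
    };

    using SymTab = std::unordered_map<std::string, CoffSymbol>;

    const CoffSymbol *resolve(const SymTab &symtab, const std::string &name) {
      auto it = symtab.find(name);
      if (it == symtab.end())
        return nullptr;
      const CoffSymbol &s = it->second;
      // An unresolved weak external falls back to its aux-linked default symbol.
      if (!s.defined && s.weakExternal) {
        auto def = symtab.find(s.auxDefault);
        return def == symtab.end() ? nullptr : &def->second;
      }
      return &s;
    }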
+ + .section .xdata$foo,"dr" + .linkonce discard + .long 42 + + .section .xdata$bar,"dr" + .linkonce discard + .long 43 + + .section .text$foo,"xr",discard,foo + .weak foo +foo: + ret + + .section .text$bar,"xr",discard,bar + .weak bar +bar: + ret diff --git a/lld/test/COFF/cgprofile-bad-clusters.s b/lld/test/COFF/cgprofile-bad-clusters.s new file mode 100644 index 0000000000000..12c05424095df --- /dev/null +++ b/lld/test/COFF/cgprofile-bad-clusters.s @@ -0,0 +1,61 @@ +# REQUIRES: x86 +# This test checks that CallGraphSort ignores edges that would form "bad" +# clusters. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A C 1" > %t.call_graph +# RUN: echo "E B 4" >> %t.call_graph +# RUN: echo "C D 2" >> %t.call_graph +# RUN: echo "B D 1" >> %t.call_graph +# RUN: echo "F G 6" >> %t.call_graph +# RUN: echo "G H 5" >> %t.call_graph +# RUN: echo "H I 4" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + + .section .text,"ax",one_only,A + .globl A +A: + retq + + .section .text,"ax",one_only,D +D: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,E +E: + retq + + .section .text,"ax",one_only,C +C: + retq + + .section .text,"ax",one_only,B +B: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,F +F: + .fill (1024 * 1024) - 1, 1, 0 + + .section .text,"ax",one_only,G +G: + retq + + .section .text,"ax",one_only,H +H: + retq + + .section .text,"ax",one_only,I +I: + .fill 13, 1, 0 + +# CHECK: 140001000 t H +# CHECK: 140001001 t I +# CHECK: 14000100e T A +# CHECK: 14000100f t C +# CHECK: 140001010 t E +# CHECK: 140001011 t B +# CHECK: 1400013f9 t D +# CHECK: 1400017e1 t F +# CHECK: 1401017e0 t G diff --git a/lld/test/COFF/cgprofile-err.s b/lld/test/COFF/cgprofile-err.s new file mode 100644 index 0000000000000..94c1c2a688627 --- /dev/null +++ b/lld/test/COFF/cgprofile-err.s @@ -0,0 +1,11 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t + +# RUN: echo "A B C 100" > %t.call_graph +# RUN: not lld-link /dll /noentry /subsystem:console %t /call-graph-ordering-file:%t.call_graph /out:/dev/null 2>&1 | FileCheck %s + +# CHECK: {{.*}}.call_graph: parse error + +# RUN: echo "A B C" > %t.call_graph +# RUN: not lld-link /dll /noentry /subsystem:console %t /call-graph-ordering-file:%t.call_graph /out:/dev/null 2>&1 | FileCheck %s diff --git a/lld/test/COFF/cgprofile-icf.s b/lld/test/COFF/cgprofile-icf.s new file mode 100644 index 0000000000000..19cdd0f26e117 --- /dev/null +++ b/lld/test/COFF/cgprofile-icf.s @@ -0,0 +1,45 @@ +# REQUIRES: x86 +# Test the compatibility of ICF and cgprofile. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "C D 61" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab /opt:icf +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOICF + + .section .text,"x",one_only,D + .globl D +D: + mov $60, %rax + retq + + .section .text,"x",one_only,C + .globl C +C: + mov $60, %rax + retq + + .section .text,"x",one_only,B + .globl B +B: + mov $2, %rax + retq + + .section .text,"x",one_only,A + .globl A +A: + mov $42, %rax + retq + +# CHECK: 140001000 T A +# CHECK: 140001008 T C +# CHECK: 140001008 T D +# CHECK: 140001010 T B + +# NOICF: 140001000 T A +# NOICF: 140001008 T B +# NOICF: 140001010 T C +# NOICF: 140001018 T D diff --git a/lld/test/COFF/cgprofile-obj.s b/lld/test/COFF/cgprofile-obj.s new file mode 100644 index 0000000000000..b267850c46382 --- /dev/null +++ b/lld/test/COFF/cgprofile-obj.s @@ -0,0 +1,45 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s +# RUN: lld-link /call-graph-profile-sort:no /subsystem:console /entry:A %t /out:%t3 /debug:symtab +# RUN: llvm-nm --numeric-sort %t3 | FileCheck %s --check-prefix=NO-CG + + .section .text,"ax", one_only, D +D: + retq + + .section .text,"ax", one_only, C + .globl C +C: + retq + + .section .text,"ax", one_only, B + .globl B +B: + retq + + .section .text,"ax", one_only, A + .globl A +A: +Aa: + retq + + .cg_profile A, B, 10 + .cg_profile A, B, 10 + .cg_profile Aa, B, 80 + .cg_profile A, C, 40 + .cg_profile B, C, 30 + .cg_profile C, D, 90 + +# CHECK: 140001000 T A +# CHECK: 140001001 T B +# CHECK: 140001002 T C +# CHECK: 140001003 t D + + +# NO-CG: 140001000 t D +# NO-CG: 140001001 T C +# NO-CG: 140001002 T B +# NO-CG: 140001003 T A diff --git a/lld/test/COFF/cgprofile-print.s b/lld/test/COFF/cgprofile-print.s new file mode 100644 index 0000000000000..e82185c0f30b1 --- /dev/null +++ b/lld/test/COFF/cgprofile-print.s @@ -0,0 +1,34 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /print-symbol-order:%t3 +# RUN: FileCheck %s --input-file %t3 + +# CHECK: B +# CHECK-NEXT: C +# CHECK-NEXT: D +# CHECK-NEXT: A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop diff --git a/lld/test/COFF/cgprofile-txt.s b/lld/test/COFF/cgprofile-txt.s new file mode 100644 index 0000000000000..49cade9dc52de --- /dev/null +++ b/lld/test/COFF/cgprofile-txt.s @@ -0,0 +1,43 @@ +# REQUIRES: x86 +# Test correctness of call graph ordering. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + +# NOSORT: 140001000 T A +# NOSORT: 140001001 T B +# NOSORT: 140001002 T C +# NOSORT: 140001003 T D + +# CHECK: 140001000 T B +# CHECK: 140001001 T C +# CHECK: 140001002 T D +# CHECK: 140001003 T A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop diff --git a/lld/test/COFF/noseh.s b/lld/test/COFF/noseh.s new file mode 100644 index 0000000000000..442952286229c --- /dev/null +++ b/lld/test/COFF/noseh.s @@ -0,0 +1,19 @@ +# REQUIRES: x86 +# RUN: llvm-mc -triple i686-w64-mingw32 %s -filetype=obj -o %t.obj +# RUN: lld-link -lldmingw %t.obj -out:%t.exe -entry:main +# RUN: llvm-readobj --file-headers %t.exe | FileCheck %s --check-prefix=DEFAULT +# RUN: lld-link -lldmingw %t.obj -out:%t.noseh.exe -entry:main -noseh +# RUN: llvm-readobj --file-headers %t.noseh.exe | FileCheck %s --check-prefix=NOSEH + +# DEFAULT: Characteristics [ +# DEFAULT-NOT: IMAGE_DLL_CHARACTERISTICS_NO_SEH +# DEFAULT: ] + +# NOSEH: Characteristics [ +# NOSEH: IMAGE_DLL_CHARACTERISTICS_NO_SEH +# NOSEH: ] + + .text + .globl _main +_main: + ret diff --git a/lld/test/ELF/bb-sections-and-icf.s b/lld/test/ELF/basic-block-sections-and-icf.s similarity index 96% rename from lld/test/ELF/bb-sections-and-icf.s rename to lld/test/ELF/basic-block-sections-and-icf.s index bcc9193c6ed8f..998d63b47b01c 100644 --- a/lld/test/ELF/bb-sections-and-icf.s +++ b/lld/test/ELF/basic-block-sections-and-icf.s @@ -1,5 +1,5 @@ # REQUIRES: x86 -## basicblock-sections tests. +## basic-block-sections tests. ## This simple test checks foo is folded into bar with bb sections ## and the jumps are deleted. diff --git a/lld/test/ELF/bb-sections-delete-fallthru.s b/lld/test/ELF/basic-block-sections-delete-fallthru.s similarity index 98% rename from lld/test/ELF/bb-sections-delete-fallthru.s rename to lld/test/ELF/basic-block-sections-delete-fallthru.s index c8a0e93534242..b029e664a1e12 100644 --- a/lld/test/ELF/bb-sections-delete-fallthru.s +++ b/lld/test/ELF/basic-block-sections-delete-fallthru.s @@ -1,5 +1,5 @@ # REQUIRES: x86 -## basicblock-sections tests. +## basic-block-sections tests. ## This simple test checks if redundant direct jumps are converted to ## implicit fallthrus. The jcc's must be converted to their inverted ## opcode, for instance jne to je and jmp must be deleted. diff --git a/lld/test/ELF/bb-sections-pc32reloc.s b/lld/test/ELF/basic-block-sections-pc32reloc.s similarity index 96% rename from lld/test/ELF/bb-sections-pc32reloc.s rename to lld/test/ELF/basic-block-sections-pc32reloc.s index 9631a3cfe6c3a..f276cd9fac731 100644 --- a/lld/test/ELF/bb-sections-pc32reloc.s +++ b/lld/test/ELF/basic-block-sections-pc32reloc.s @@ -1,5 +1,5 @@ # REQUIRES: x86 -## basicblock-sections tests. +## basic-block-sections tests. ## This simple test checks if redundant direct jumps are converted to ## implicit fallthrus when PC32 reloc is present. 
The jcc's must be converted ## to their inverted opcode, for instance jne to je and jmp must be deleted. diff --git a/lld/test/ELF/dependency-file.s b/lld/test/ELF/dependency-file.s new file mode 100644 index 0000000000000..e7dbf9c7695f7 --- /dev/null +++ b/lld/test/ELF/dependency-file.s @@ -0,0 +1,21 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t/foo.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o "%t/bar baz.o" +# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o "%t/#quux$.o" +# RUN: ld.lld -o %t/foo.exe %t/foo.o %t/"bar baz.o" "%t/#quux$.o" --dependency-file=%t/foo.d +# RUN: FileCheck --match-full-lines -DFILE=%t %s < %t/foo.d + +# CHECK: [[FILE]]{{/|(\\)+}}foo.exe: \ +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}foo.o \ +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}bar\ baz.o \ +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}\#quux$$.o +# CHECK-EMPTY: +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}foo.o: +# CHECK-EMPTY: +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}bar\ baz.o: +# CHECK-EMPTY: +# CHECK-NEXT: [[FILE]]{{/|(\\)+}}\#quux$$.o: + +.global _start +_start: diff --git a/lld/test/ELF/linkerscript/Inputs/define.s b/lld/test/ELF/linkerscript/Inputs/define.s index bc60a233dcb4b..69f47a8803164 100644 --- a/lld/test/ELF/linkerscript/Inputs/define.s +++ b/lld/test/ELF/linkerscript/Inputs/define.s @@ -6,3 +6,6 @@ defined = 0 .section .bar,"a" .quad 1 + +.section .test,"a" +.quad 1 diff --git a/lld/test/ELF/linkerscript/define.test b/lld/test/ELF/linkerscript/define.test index 689476ba32ad0..3ecaa11cc5b69 100644 --- a/lld/test/ELF/linkerscript/define.test +++ b/lld/test/ELF/linkerscript/define.test @@ -3,13 +3,17 @@ # RUN: ld.lld -o %t --script %s %t.o # RUN: llvm-objdump --section-headers %t | FileCheck %s +EXTERN(extern_defined) SECTIONS { . = DEFINED(defined) ? 0x11000 : .; .foo : { *(.foo*) } . = DEFINED(notdefined) ? 0x12000 : 0x13000; .bar : { *(.bar*) } + . = DEFINED(extern_defined) ? 
0x14000 : 0x15000; + .test : { *(.test*) } } # CHECK: 1 .foo 00000008 0000000000011000 DATA # CHECK: 2 .bar 00000008 0000000000013000 DATA -# CHECK: 3 .text 00000000 0000000000013008 TEXT +# CHECK: 3 .test 00000008 0000000000015000 DATA +# CHECK: 4 .text 00000000 0000000000015008 TEXT diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index c2014722985d3..20b07b2b185a2 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -1,8 +1,8 @@ # REQUIRES: x86 -# RUN: extract asm %s -o %t.s && extract lds %s -o %t.lds -# RUN: llvm-mc -filetype=obj -triple=x86_64 %t.s -o %t.o -# RUN: ld.lld -o %t --script %t.lds %t.o -# RUN: llvm-readelf -S -l %t | FileCheck %s +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/asm -o %t.o +# RUN: ld.lld --script %t/lds %t.o -o %t/out +# RUN: llvm-readelf -S -l %t/out | FileCheck %s # CHECK: Name Type Address Off Size # CHECK: .data_noload_a NOBITS 0000000000000000 [[OFF:[0-9a-f]+]] 001000 diff --git a/lld/test/ELF/linkerscript/operators.test b/lld/test/ELF/linkerscript/operators.test index 1d40c81321d96..8ba8ee7ea41dc 100644 --- a/lld/test/ELF/linkerscript/operators.test +++ b/lld/test/ELF/linkerscript/operators.test @@ -38,6 +38,14 @@ SECTIONS { minus_abs = _end - _start; max = MAX(11, 22); min = MIN(11, 22); + log2ceil0 = LOG2CEIL(0); + log2ceil1 = LOG2CEIL(1); + log2ceil2 = LOG2CEIL(2); + log2ceil3 = LOG2CEIL(3); + log2ceil4 = LOG2CEIL(4); + log2ceil100000000 = LOG2CEIL(0x100000000); + log2ceil100000001 = LOG2CEIL(0x100000001); + log2ceilmax = LOG2CEIL(0xffffffffffffffff); logicaland1 = 0 && 0; logicaland2 = 0 && 1; logicaland3 = 1 && 0; @@ -78,6 +86,14 @@ SECTIONS { # CHECK-NEXT: 0000000000fff0 A minus_abs # CHECK-NEXT: 00000000000016 A max # CHECK-NEXT: 0000000000000b A min +# CHECK-NEXT: 0000000000000000 A log2ceil0 +# CHECK-NEXT: 0000000000000000 A log2ceil1 +# CHECK-NEXT: 0000000000000001 A log2ceil2 +# CHECK-NEXT: 0000000000000002 A log2ceil3 +# CHECK-NEXT: 0000000000000002 A log2ceil4 +# CHECK-NEXT: 0000000000000020 A log2ceil100000000 +# CHECK-NEXT: 0000000000000021 A log2ceil100000001 +# CHECK-NEXT: 0000000000000040 A log2ceilmax # CHECK-NEXT: 00000000000000 A logicaland1 # CHECK-NEXT: 00000000000000 A logicaland2 # CHECK-NEXT: 00000000000000 A logicaland3 diff --git a/lld/test/ELF/lto/basic-block-sections.ll b/lld/test/ELF/lto/basic-block-sections.ll new file mode 100644 index 0000000000000..1f932ac50a879 --- /dev/null +++ b/lld/test/ELF/lto/basic-block-sections.ll @@ -0,0 +1,41 @@ +; REQUIRES: x86 +; RUN: llvm-as %s -o %t.o +; RUN: ld.lld %t.o -o %t --lto-basic-block-sections=all --lto-O0 --save-temps +; RUN: llvm-readobj -s %t.lto.o | FileCheck --check-prefix=SECNAMES %s +; RUN: ld.lld %t.o -o %t --lto-basic-block-sections=all --lto-unique-basic-block-section-names --lto-O0 --save-temps +; RUN: llvm-readobj -s %t.lto.o | FileCheck --check-prefix=SECNAMES-FULL %s +; RUN: llvm-nm %t | FileCheck --check-prefix=SYMS %s + +; SECNAMES: Name: .text.foo {{.*}} +; SECNAMES: Name: .text.foo {{.*}} +; SECNAMES: Name: .text.foo {{.*}} + +; SECNAMES-FULL: Name: .text.foo {{.*}} +; SECNAMES-FULL: Name: .text.foo.foo.1 {{.*}} +; SECNAMES-FULL: Name: .text.foo.foo.2 {{.*}} + +; SYMS: foo +; SYMS: foo.1 +; SYMS: foo.2 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(i32 %b) local_unnamed_addr { +entry: + %tobool.not = 
icmp eq i32 %b, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %entry + tail call void @foo(i32 0) + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +define void @_start() { + call void @foo(i32 1) + ret void +} diff --git a/lld/test/ELF/ppc64-pcrel-call-to-extern.s b/lld/test/ELF/ppc64-pcrel-call-to-extern.s new file mode 100644 index 0000000000000..ab2eaf09a6acc --- /dev/null +++ b/lld/test/ELF/ppc64-pcrel-call-to-extern.s @@ -0,0 +1,131 @@ +# REQUIRES: ppc +# RUN: echo 'SECTIONS { \ +# RUN: .text_caller1 0x10010000 : { *(.text_caller1) } \ +# RUN: .text_caller2 0x10020000 : { *(.text_caller2) } \ +# RUN: .text_caller3 0x10030000 : { *(.text_caller3) } \ +# RUN: }' > %t.script + +# RUN: llvm-mc -filetype=obj -triple=powerpc64le --defsym AUX=1 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=powerpc64le %s -o %t2.o +# RUN: ld.lld --shared %t2.o -o %t2.so +# RUN: ld.lld -T %t.script %t1.o %t2.so -o %t +# RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=SYMBOL +# RUN: llvm-readelf -S -d %t | FileCheck %s --check-prefix=SEC +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=REL +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t | FileCheck %s + +# RUN: llvm-mc -filetype=obj -triple=powerpc64 --defsym AUX=1 %s -o %t1.o +# RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t2.o +# RUN: ld.lld --shared %t2.o -o %t2.so +# RUN: ld.lld -T %t.script %t1.o %t2.so -o %t +# RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=SYMBOL +# RUN: llvm-readelf -S -d %t | FileCheck %s --check-prefix=SEC +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=REL +# RUN: llvm-objdump -d --no-show-raw-insn --mcpu=pwr10 %t | FileCheck %s + +## The test is created to check that when a function without TOC accesses an +## external function, an r12 setup stub is inserted. + +# SYMBOL: Symbol table '.dynsym' contains 4 entries: +# SYMBOL: 1: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [] UND callee_global_TOC +# SYMBOL-NEXT: 2: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND callee_global_stother0 +# SYMBOL-NEXT: 3: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [] UND callee_global_stother1 + +# SYMBOL: Symbol table '.symtab' contains 12 entries: +# SYMBOL: 2: 0000000010010000 0 NOTYPE LOCAL DEFAULT [] 6 caller1 +# SYMBOL-NEXT: 3: 0000000010020000 0 NOTYPE LOCAL DEFAULT [] 7 caller2 +# SYMBOL-NEXT: 4: 0000000010030000 0 NOTYPE LOCAL DEFAULT [] 8 caller3 +# SYMBOL: 6: 0000000010010008 16 FUNC LOCAL DEFAULT 6 __plt_pcrel_callee_global_stother0 +# SYMBOL-NEXT: 7: 0000000010020008 16 FUNC LOCAL DEFAULT 7 __plt_pcrel_callee_global_stother1 +# SYMBOL-NEXT: 8: 0000000010030008 16 FUNC LOCAL DEFAULT 8 __plt_pcrel_callee_global_TOC +# SYMBOL-NEXT: 9: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [] UND callee_global_TOC +# SYMBOL-NEXT: 10: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND callee_global_stother0 +# SYMBOL-NEXT: 11: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [] UND callee_global_stother1 + +## DT_PLTGOT points to .plt +# SEC: .plt NOBITS 0000000010030140 040140 000028 00 WA 0 0 8 +# SEC: 0x0000000000000003 (PLTGOT) 0x10030140 + +## The first 2 entries in the .plt are reserved for the dynamic linker's +## usage. The JMP_SLOT relocations are stored at .plt[2], .plt[3], .plt[4]. +## Check that we emit 3 R_PPC64_JMP_SLOT in .rela.plt.
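Each __plt_pcrel_* stub that the CHECK lines below disassemble loads r12 with a PC-relative prefixed load, so the displacement encoded in the pld is simply pltSlotVA - stubVA; the inline comments below spell out that arithmetic. A quick self-check of those numbers:

    #include <cassert>
    #include <cstdint>

    int main() {
      // pld displacement = pltSlotVA - stubVA; values from the CHECK lines below.
      struct Stub { uint64_t stubVA, pltSlotVA; int64_t pldDisp; };
      const Stub stubs[] = {
          {0x10010008, 0x10030150, 131400}, // __plt_pcrel_callee_global_stother0
          {0x10020008, 0x10030158, 65872},  // __plt_pcrel_callee_global_stother1
          {0x10030008, 0x10030160, 344},    // __plt_pcrel_callee_global_TOC
      };
      for (const Stub &s : stubs)
        assert((int64_t)(s.pltSlotVA - s.stubVA) == s.pldDisp);
      return 0;
    }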
+# REL: .rela.plt { +# REL-NEXT: 0x10030150 R_PPC64_JMP_SLOT callee_global_stother0 0x0 +# REL-NEXT: 0x10030158 R_PPC64_JMP_SLOT callee_global_stother1 0x0 +# REL-NEXT: 0x10030160 R_PPC64_JMP_SLOT callee_global_TOC 0x0 +# REL-NEXT: } + +# CHECK-LABEL: <caller1>: +# CHECK: 10010000: bl 0x10010008 +# CHECK-NEXT: 10010004: blr + +## .plt[2] - 0x10010008 = 0x10030150 - 0x10010008 = 0x20148 = 131400 +# CHECK-LABEL: <__plt_pcrel_callee_global_stother0>: +# CHECK: 10010008: pld 12, 131400(0), 1 +# CHECK-NEXT: 10010010: mtctr 12 +# CHECK-NEXT: 10010014: bctr + +# CHECK-LABEL: <caller2>: +# CHECK: 10020000: bl 0x10020008 +# CHECK-NEXT: 10020004: blr + +## .plt[3] - 0x10020008 = 0x10030158 - 0x10020008 = 0x10150 = 65872 +# CHECK-LABEL: <__plt_pcrel_callee_global_stother1>: +# CHECK: 10020008: pld 12, 65872(0), 1 +# CHECK-NEXT: 10020010: mtctr 12 +# CHECK-NEXT: 10020014: bctr + +# CHECK-LABEL: <caller3>: +# CHECK: 10030000: bl 0x10030008 +# CHECK-NEXT: 10030004: blr + +## .plt[4] - 0x10030008 = 0x10030160 - 0x10030008 = 0x158 = 344 +# CHECK-LABEL: <__plt_pcrel_callee_global_TOC>: +# CHECK: 10030008: pld 12, 344(0), 1 +# CHECK-NEXT: 10030010: mtctr 12 +# CHECK-NEXT: 10030014: bctr + +.ifdef AUX +.section .text_caller1, "ax", %progbits +caller1: + .localentry caller1, 1 + bl callee_global_stother0@notoc + blr +.section .text_caller2, "ax", %progbits +caller2: + .localentry caller2, 1 + bl callee_global_stother1@notoc + blr + +.section .text_caller3, "ax", %progbits +caller3: + .localentry caller3, 1 + bl callee_global_TOC@notoc + blr + +.else +func_extern: + blr +.globl callee_global_stother0 +callee_global_stother0: + blr +.globl callee_global_stother1 +callee_global_stother1: + .localentry callee_global_stother1, 1 + ## nop is not needed after bl for R_PPC64_REL24_NOTOC + bl func_extern@notoc + blr +.globl callee_global_TOC +callee_global_TOC: +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry callee_global_TOC, .Lfunc_lep1-.Lfunc_gep1 + addis 4, 2, global@toc@ha + lwz 3, global@toc@l(4) + blr +global: + .long 0 + .size global, 4 +.endif diff --git a/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s b/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s index a97160a430dd0..ad80ed720b637 100644 --- a/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s +++ b/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s @@ -1,21 +1,22 @@ # REQUIRES: ppc # RUN: echo 'SECTIONS { \ -# RUN: .text_low 0x10010000: { *(.text_low) } \ -# RUN: .text_overflow 0x1000000000 : { *(.text_overflow) } \ -# RUN: }' > %t.script +# RUN: .text 0x10000: { *(.text) } \ +# RUN: .data 0x200010000 : { *(.data) } \ +# RUN: }' > %t.script # RUN: llvm-mc -filetype=obj -triple=powerpc64le %s -o %t.o -# RUN: not ld.lld -T %t.script %t.o -o %t +# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t.o -# RUN: not ld.lld -T %t.script %t.o -o %t +# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck %s -.section .text_low, "ax", %progbits -# CHECK: relocation R_PPC64_PCREL34 out of range -GlobIntOverflow: +# CHECK: relocation R_PPC64_PCREL34 out of range: 8589934592 is not in [-8589934592, 8589934591] plwa 3, glob_overflow@PCREL(0), 1 - blr -.section .text_overflow, "ax", %progbits + +# CHECK-NOT: relocation + plwa 3, .data@PCREL(0), 1 + +.data glob_overflow: .long 0 .size glob_overflow, 4 diff --git a/lld/test/ELF/reproduce-lto.s b/lld/test/ELF/reproduce-lto.s new file mode 100644 index 0000000000000..e7c3ece61085d --- /dev/null +++ 
b/lld/test/ELF/reproduce-lto.s @@ -0,0 +1,10 @@ +# REQUIRES: x86 + +# RUN: rm -rf %t.dir +# RUN: mkdir -p %t.dir/build1 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.dir/build1/foo.o +# RUN: echo > %t.dir/build1/empty_profile.txt +# RUN: cd %t.dir +# RUN: ld.lld build1/foo.o -o /dev/null --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt +# RUN: tar tvf repro1.tar | FileCheck %s +# CHECK: repro1/{{.*}}/empty_profile.txt diff --git a/lld/test/ELF/wrap-dynamic-undef.s b/lld/test/ELF/wrap-dynamic-undef.s index 2abb826411b39..af2871cfe6eae 100644 --- a/lld/test/ELF/wrap-dynamic-undef.s +++ b/lld/test/ELF/wrap-dynamic-undef.s @@ -8,10 +8,9 @@ # Test that the dynamic relocation uses foo. We used to produce a # relocation with __real_foo. -# CHECK: Symbol table '.dynsym' contains 3 entries: +# CHECK: Symbol table '.dynsym' contains 2 entries: # CHECK: NOTYPE LOCAL DEFAULT UND # CHECK-NEXT: NOTYPE GLOBAL DEFAULT UND foo -# CHECK-NEXT: NOTYPE GLOBAL DEFAULT UND __wrap_foo .global _start _start: diff --git a/lld/test/ELF/wrap-plt.s b/lld/test/ELF/wrap-plt.s index 2584d6ac09bb1..57b417e7b2f0c 100644 --- a/lld/test/ELF/wrap-plt.s +++ b/lld/test/ELF/wrap-plt.s @@ -11,6 +11,7 @@ // CHECK: Relocations [ // CHECK-NEXT: Section ({{.*}}) .rela.plt { // CHECK-NEXT: R_X86_64_JUMP_SLOT __wrap_foo 0x0 +// CHECK-NEXT: R_X86_64_JUMP_SLOT foo 0x0 // CHECK-NEXT: R_X86_64_JUMP_SLOT _start 0x0 // CHECK-NEXT: } // CHECK-NEXT: ] @@ -18,6 +19,7 @@ // DISASM: <_start>: // DISASM-NEXT: jmp {{.*}} <__wrap_foo@plt> // DISASM-NEXT: jmp {{.*}} <__wrap_foo@plt> +// DISASM-NEXT: jmp {{.*}} <foo@plt> // DISASM-NEXT: jmp {{.*}} <_start@plt> .global foo @@ -32,4 +34,5 @@ __wrap_foo: _start: jmp foo@plt jmp __wrap_foo@plt + jmp __real_foo@plt jmp _start@plt diff --git a/lld/test/ELF/wrap-shlib-undefined.s b/lld/test/ELF/wrap-shlib-undefined.s index 8bbda963f0702..b0451b2865abe 100644 --- a/lld/test/ELF/wrap-shlib-undefined.s +++ b/lld/test/ELF/wrap-shlib-undefined.s @@ -12,12 +12,11 @@ # RUN: ld.lld %t.o %t.so --wrap=foo -o %t # RUN: llvm-readelf --dyn-syms %t | FileCheck %s -## FIXME GNU ld does not export __wrap_foo ## The reference __real_foo from %t.so causes foo to be exported. -# CHECK: Symbol table '.dynsym' contains 4 entries: +## __wrap_foo is not used, thus not exported. +# CHECK: Symbol table '.dynsym' contains 3 entries: # CHECK: NOTYPE LOCAL DEFAULT UND # CHECK-NEXT: NOTYPE GLOBAL DEFAULT UND bar -# CHECK-NEXT: NOTYPE GLOBAL DEFAULT UND __wrap_foo # CHECK-NEXT: NOTYPE GLOBAL DEFAULT 6 foo .globl _start, foo diff --git a/lld/test/MachO/dso-handle.s b/lld/test/MachO/dso-handle.s new file mode 100644 index 0000000000000..f57ec7260fe06 --- /dev/null +++ b/lld/test/MachO/dso-handle.s @@ -0,0 +1,16 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o + +# RUN: lld -flavor darwinnew %t.o -o %t +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s +# CHECK: leaq {{.*}} # 100000000 + +# RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib +# RUN: llvm-objdump -d --no-show-raw-insn %t.dylib | FileCheck %s --check-prefix=DYLIB-CHECK +# DYLIB-CHECK: leaq {{.*}} # 0 + +.globl _main +.text +_main: + leaq ___dso_handle(%rip), %rdx + ret diff --git a/lld/test/MachO/filelist.s b/lld/test/MachO/filelist.s new file mode 100644 index 0000000000000..673740ebf06df --- /dev/null +++ b/lld/test/MachO/filelist.s @@ -0,0 +1,40 @@ +# REQUIRES: x86 + +## This test verifies that the paths in -filelist get processed in command-line +## order.
+ +# RUN: mkdir -p %t +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,first; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/first.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,second; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/second.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o + +# FIRST: __TEXT,first _foo +# SECOND: __TEXT,second _foo + +# RUN: echo "%t/first.o" > filelist +# RUN: echo "%t/second.o" >> filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST + +# RUN: echo "%t/second.o" > filelist +# RUN: echo "%t/first.o" >> filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +# RUN: echo "%t/first.o" > filelist +# RUN: lld -flavor darwinnew -Z -filelist filelist %t/second.o %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST +# RUN: lld -flavor darwinnew -Z %t/second.o -filelist filelist %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +# RUN: echo "%t/first.o" > filelist-1 +# RUN: echo "%t/second.o" > filelist-2 +# RUN: lld -flavor darwinnew -Z -filelist filelist-1 -filelist filelist-2 %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=FIRST +# RUN: lld -flavor darwinnew -Z -filelist filelist-2 -filelist filelist-1 %t/test.o -o %t/test +# RUN: llvm-objdump --syms %t/test | FileCheck %s --check-prefix=SECOND + +.globl _main + +_main: + ret diff --git a/lld/test/MachO/framework.s b/lld/test/MachO/framework.s new file mode 100644 index 0000000000000..a527970ed6bc9 --- /dev/null +++ b/lld/test/MachO/framework.s @@ -0,0 +1,29 @@ +# REQUIRES: x86, shell +# RUN: mkdir -p %t +# RUN: echo ".globl _foo; _foo: ret" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: mkdir -p %t/Foo.framework/Versions/A +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -dylib -install_name %t/Foo.framework/Versions/A/Foo %t/foo.o -o %t/Foo.framework/Versions/A/Foo +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -dylib -install_name %t/Foo.framework/Versions/A/Foobar %t/foo.o -o %t/Foo.framework/Versions/A/Foobar +# RUN: ln -sf %t/Foo.framework/Versions/A %t/Foo.framework/Versions/Current +# RUN: ln -sf %t/Foo.framework/Versions/Current/Foo %t/Foo.framework/Foo + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/test.o %s +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo %t/test.o -o %t/test +# RUN: llvm-objdump --macho --lazy-bind %t/test | FileCheck %s --check-prefix=NOSUFFIX +# NOSUFFIX: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} {{.*}}Foo _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo,baz %t/test.o -o %t/test-wrong-suffix +# RUN: llvm-objdump --macho --lazy-bind %t/test-wrong-suffix | FileCheck %s --check-prefix=NOSUFFIX + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -F%t -framework Foo,bar %t/test.o -o %t/test-suffix +# RUN: llvm-objdump --macho --lazy-bind %t/test-suffix | FileCheck %s --check-prefix=SUFFIX +# SUFFIX: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} {{.*}}Foobar _foo + +.globl _main +.text +_main: + sub $8, %rsp # 16-byte-align the stack; dyld checks for this + callq _foo + mov $0, %rax + 
add $8, %rsp + ret diff --git a/lld/test/MachO/headerpad.s b/lld/test/MachO/headerpad.s new file mode 100644 index 0000000000000..963ccf9ce4e70 --- /dev/null +++ b/lld/test/MachO/headerpad.s @@ -0,0 +1,34 @@ +# REQUIRES: x86 + +## NOTE: Here we check that the first non-header section -- __text -- appears +## *exactly* `-headerpad` bytes from the end of the header. ld64 actually +## starts laying out the non-header sections in the __TEXT segment from the end +## of the (page-aligned) segment rather than the front, so its binaries +## typically have more than `-headerpad` bytes of actual padding. `-headerpad` +## just enforces a lower bound. We should consider implementing the same +## alignment behavior. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: lld -flavor darwinnew -o %t %t.o -headerpad 0 +# RUN: llvm-objdump --macho --all-headers %t | FileCheck %s --check-prefix=PAD0 +# PAD0: magic cputype cpusubtype caps filetype ncmds sizeofcmds +# PAD0-NEXT: MH_MAGIC_64 X86_64 ALL LIB64 EXECUTE 8 [[#%u, CMDSIZE:]] {{.*}} +# PAD0: sectname __text +# PAD0-NEXT: segname __TEXT +# PAD0-NEXT: addr +# PAD0-NEXT: size +# PAD0-NEXT: offset [[#%u, CMDSIZE + 32]] + +# RUN: lld -flavor darwinnew -o %t %t.o -headerpad 11 +# RUN: llvm-objdump --macho --all-headers %t | FileCheck %s --check-prefix=PAD11 +# PAD11: magic cputype cpusubtype caps filetype ncmds sizeofcmds +# PAD11-NEXT: MH_MAGIC_64 X86_64 ALL LIB64 EXECUTE 8 [[#%u, CMDSIZE:]] {{.*}} +# PAD11: sectname __text +# PAD11-NEXT: segname __TEXT +# PAD11-NEXT: addr +# PAD11-NEXT: size +# PAD11-NEXT: offset [[#%u, CMDSIZE + 32 + 0x11]] + +.globl _main +_main: + ret diff --git a/lld/test/MachO/invalid/dso-handle-duplicate.s b/lld/test/MachO/invalid/dso-handle-duplicate.s new file mode 100644 index 0000000000000..5991c6faff882 --- /dev/null +++ b/lld/test/MachO/invalid/dso-handle-duplicate.s @@ -0,0 +1,20 @@ +# REQUIRES: x86 + +## If for some bizarre reason the input file defines its own ___dso_handle, we +## should raise an error. At least, we've implemented this behavior if the +## conflicting symbol is a global. A local symbol of the same name will still +## take priority in our implementation, unlike in ld64. But that's a pretty +## far-out edge case that should be safe to ignore. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: not lld -flavor darwinnew -dylib %t.o -o %t.dylib 2>&1 | FileCheck %s -DFILE=%t.o +# CHECK: error: found defined symbol from [[FILE]] with illegal name ___dso_handle + +.globl _main, ___dso_handle
.text
_main: + leaq ___dso_handle(%rip), %rdx + ret + +___dso_handle: + .space 1 diff --git a/lld/test/MachO/invalid/no-filelist.s b/lld/test/MachO/invalid/no-filelist.s new file mode 100644 index 0000000000000..980814cf6eeb1 --- /dev/null +++ b/lld/test/MachO/invalid/no-filelist.s @@ -0,0 +1,9 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: not lld -flavor darwinnew -Z -filelist nonexistent %t.o -o %t 2>&1 | FileCheck %s +# CHECK: cannot open nonexistent: {{N|n}}o such file or directory + +.globl _main + +_main: + ret diff --git a/lld/test/MachO/linkedit-contiguity.s b/lld/test/MachO/linkedit-contiguity.s new file mode 100644 index 0000000000000..f815e5555a859 --- /dev/null +++ b/lld/test/MachO/linkedit-contiguity.s @@ -0,0 +1,41 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## codesign requires that each section in __LINKEDIT ends where the next one +## starts. This test enforces that invariant.
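This is the invariant the LinkEditSection class introduced earlier maintains: every __LINKEDIT section's size is rounded up to WordSize, so back-to-back layout can never leave a gap. A small sketch of why padded sizes make the sections butt up exactly (hypothetical raw sizes; alignTo mirrors llvm::alignTo for power-of-two alignments):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static uint64_t alignTo(uint64_t value, uint64_t align) {
      return (value + align - 1) & ~(align - 1);
    }

    int main() {
      constexpr uint64_t wordSize = 8;
      // Hypothetical raw (unpadded) __LINKEDIT section sizes, in layout order.
      const std::vector<uint64_t> rawSizes = {13, 40, 7, 256};
      uint64_t offset = 0;
      for (uint64_t raw : rawSizes) {
        uint64_t start = alignTo(offset, wordSize); // next word-aligned offset
        // Because each previous section's size was already padded to wordSize,
        // aligning the start cannot introduce a gap: start == offset.
        assert(start == offset);
        offset = start + alignTo(raw, wordSize);    // LinkEditSection::getSize()
      }
      return 0;
    }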
+## TODO: Test other __LINKEDIT sections here as support for them gets added. +## Examples of such sections include the data for LC_CODE_SIGNATURE and +## LC_DATA_IN_CODE. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %p/Inputs/libhello.s \ +# RUN: -o %t/libhello.o +# RUN: lld -flavor darwinnew -dylib -L%S/Inputs/MacOSX.sdk/usr/lib \ +# RUN: -install_name @executable_path/libhello.dylib %t/libhello.o \ +# RUN: -o %t/libhello.dylib + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: lld -flavor darwinnew -o %t/test \ +# RUN: -L%S/Inputs/MacOSX.sdk/usr/lib -L%t -lhello %t/test.o -lSystem + +# RUN: llvm-objdump --macho --all-headers %t/test | FileCheck %s + +# CHECK: cmd LC_DYLD_INFO_ONLY +# CHECK-NEXT: cmdsize 48 +# CHECK-NEXT: rebase_off 0 +# CHECK-NEXT: rebase_size 0 +# CHECK-NEXT: bind_off [[#BIND_OFF:]] +# CHECK-NEXT: bind_size [[#BIND_SIZE:]] +# CHECK-NEXT: weak_bind_off 0 +# CHECK-NEXT: weak_bind_size 0 +# CHECK-NEXT: lazy_bind_off [[#LAZY_OFF: BIND_OFF + BIND_SIZE]] +# CHECK-NEXT: lazy_bind_size [[#LAZY_SIZE:]] +# CHECK-NEXT: export_off [[#EXPORT_OFF: LAZY_OFF + LAZY_SIZE]] +# CHECK-NEXT: export_size [[#]] + +.text +.globl _main +_main: + sub $8, %rsp # 16-byte-align the stack; dyld checks for this + callq _print_hello + add $8, %rsp + ret diff --git a/lld/test/MachO/local-got.s b/lld/test/MachO/local-got.s index 6099a6bf18c7b..baf4b3336b7ab 100644 --- a/lld/test/MachO/local-got.s +++ b/lld/test/MachO/local-got.s @@ -12,12 +12,12 @@ ## address offset and the contents at that address very similarly, so am using ## --match-full-lines to make sure we match on the right thing. # CHECK: Contents of section __cstring: -# CHECK-NEXT: 1000003cc {{.*}} +# CHECK-NEXT: 1000003ec {{.*}} ## 1st 8 bytes refer to the start of __cstring + 0xe, 2nd 8 bytes refer to the ## start of __cstring # CHECK: Contents of section __got: -# CHECK-NEXT: [[#%X,ADDR:]] da030000 01000000 cc030000 01000000 {{.*}} +# CHECK-NEXT: [[#%X,ADDR:]] fa030000 01000000 ec030000 01000000 {{.*}} # CHECK-NEXT: [[#ADDR + 16]] 00000000 00000000 {{.*}} ## Check that a non-locally-defined symbol is still bound at the correct offset: diff --git a/lld/test/MachO/relocations.s b/lld/test/MachO/relocations.s index edaa308935608..93be4998db68f 100644 --- a/lld/test/MachO/relocations.s +++ b/lld/test/MachO/relocations.s @@ -21,7 +21,7 @@ # RUN: llvm-objdump --section=__const --full-contents -d %t | FileCheck %s --check-prefix=NONPCREL # NONPCREL: Contents of section __const: -# NONPCREL-NEXT: 100001000 b0030000 01000000 b0030000 01000000 +# NONPCREL-NEXT: 100001000 d0030000 01000000 d0030000 01000000 .section __TEXT,__text .globl _main, _f diff --git a/lld/test/MachO/section-headers.s b/lld/test/MachO/section-headers.s index 9fafc5a912b0f..fdfdbed632450 100644 --- a/lld/test/MachO/section-headers.s +++ b/lld/test/MachO/section-headers.s @@ -1,7 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o # RUN: lld -flavor darwinnew -o %t %t.o -# RUN: llvm-readobj --section-headers %t | FileCheck %s +# RUN: llvm-readobj --section-headers --macho-segment %t | FileCheck %s # CHECK: Name: __text # CHECK-NEXT: Segment: __TEXT @@ -25,11 +25,21 @@ # CHECK: Name: maxlen_16ch_name # CHECK-NEXT: Segment: __TEXT -# CHECK-NOT: } -# CHECK: Alignment: 3 +# CHECK-NEXT: Address: +# CHECK-NEXT: Size: [[#%x, LAST_SEC_SIZE:]] +# CHECK-NEXT: Offset: [[#%u, LAST_SEC_OFF:]] +# CHECK-NEXT: Alignment: 3 # CHECK-NOT: } # CHECK: Type: Regular (0x0) +# CHECK-LABEL: Segment { +# CHECK: Name: 
__TEXT +# CHECK-NEXT: Size: +# CHECK-NEXT: vmaddr: +# CHECK-NEXT: vmsize: +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: [[#%u, LAST_SEC_SIZE + LAST_SEC_OFF]] + .text .align 1 .global _main diff --git a/lld/test/MachO/segments.s b/lld/test/MachO/segments.s index acb0f1e90101a..e0f127fabe55f 100644 --- a/lld/test/MachO/segments.s +++ b/lld/test/MachO/segments.s @@ -1,49 +1,58 @@ -# REQUIRES: x86 +# REQUIRES: x86, shell # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o # RUN: lld -flavor darwinnew -o %t %t.o -# RUN: llvm-readobj --macho-segment %t | FileCheck %s +# RUN: (llvm-readobj --macho-segment %t; echo "Total file size"; wc -c %t) | FileCheck %s ## These two segments must always be present at the start of an executable. # CHECK-NOT: Segment { # CHECK: Segment { -# CHECK: Cmd: LC_SEGMENT_64 -# CHECK: Name: __PAGEZERO -# CHECK: Size: 72 -# CHECK: vmaddr: 0x0 -# CHECK: vmsize: 0x100000000 -# CHECK: fileoff: 0 -# CHECK: filesize: 0 +# CHECK-NEXT: Cmd: LC_SEGMENT_64 +# CHECK-NEXT: Name: __PAGEZERO +# CHECK-NEXT: Size: 72 +# CHECK-NEXT: vmaddr: 0x0 +# CHECK-NEXT: vmsize: 0x100000000 +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: 0 ## The kernel won't execute a binary with the wrong protections for __PAGEZERO. -# CHECK: maxprot: --- -# CHECK: initprot: --- -# CHECK: nsects: 0 -# CHECK: flags: 0x0 -# CHECK: } -# CHECK: Segment { -# CHECK: Cmd: LC_SEGMENT_64 -# CHECK: Name: __TEXT -# CHECK: Size: 152 -# CHECK: vmaddr: 0x100000000 -# CHECK: vmsize: +# CHECK-NEXT: maxprot: --- +# CHECK-NEXT: initprot: --- +# CHECK-NEXT: nsects: 0 +# CHECK-NEXT: flags: 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: Segment { +# CHECK-NEXT: Cmd: LC_SEGMENT_64 +# CHECK-NEXT: Name: __TEXT +# CHECK-NEXT: Size: 152 +# CHECK-NEXT: vmaddr: 0x100000000 +# CHECK-NEXT: vmsize: ## dyld3 assumes that the __TEXT segment starts from the file header -# CHECK: fileoff: 0 -# CHECK: filesize: -# CHECK: maxprot: rwx -# CHECK: initprot: r-x -# CHECK: nsects: 1 -# CHECK: flags: 0x0 -# CHECK: } +# CHECK-NEXT: fileoff: 0 +# CHECK-NEXT: filesize: +# CHECK-NEXT: maxprot: rwx +# CHECK-NEXT: initprot: r-x +# CHECK-NEXT: nsects: 1 +# CHECK-NEXT: flags: 0x0 +# CHECK-NEXT: } ## Check that we handle max-length names correctly. # CHECK: Cmd: LC_SEGMENT_64 # CHECK-NEXT: Name: maxlen_16ch_name -## This segment must always be present at the end of an executable. +## This segment must always be present at the end of an executable, and cover +## its last byte. 
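This works together with the LCSegment::filesize change earlier, which computes a segment's file size as the furthest extent of its non-zerofill sections rather than a running sum; for the trailing __LINKEDIT segment that extent must coincide with the end of the binary, which is what the Total file size check below asserts. A sketch of that computation with made-up offsets:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Section {
      uint64_t fileOff, fileSize;
      bool zeroFill; // zerofill sections occupy no space in the file
    };

    int main() {
      // Hypothetical __LINKEDIT layout; all offsets are illustrative.
      const uint64_t segFileOff = 0x4000;
      const std::vector<Section> sections = {{0x4000, 0x120, false},
                                             {0x4120, 0x88, false}};
      // LCSegment: filesize = furthest extent of any non-zerofill section.
      uint64_t segFileSize = 0;
      for (const Section &s : sections)
        if (!s.zeroFill)
          segFileSize = std::max(segFileSize, s.fileOff + s.fileSize - segFileOff);
      // For the last segment this must equal the file's total size (`wc -c`).
      const uint64_t totalFileSize = 0x41a8;
      assert(segFileOff + segFileSize == totalFileSize);
      return 0;
    }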
# CHECK: Name: __LINKEDIT -# CHECK: maxprot: rwx -# CHECK: initprot: r-- +# CHECK-NEXT: Size: +# CHECK-NEXT: vmaddr: +# CHECK-NEXT: vmsize: +# CHECK-NEXT: fileoff: [[#%u, LINKEDIT_OFF:]] +# CHECK-NEXT: filesize: [[#%u, LINKEDIT_SIZE:]] +# CHECK-NEXT: maxprot: rwx +# CHECK-NEXT: initprot: r-- # CHECK-NOT: Cmd: LC_SEGMENT_64 +# CHECK-LABEL: Total file size +# CHECK-NEXT: [[#%u, LINKEDIT_OFF + LINKEDIT_SIZE]] + .text .global _main _main: diff --git a/lld/test/MachO/silent-ignore.test b/lld/test/MachO/silent-ignore.test index ae68dd8fe81ff..0d2086386fa8f 100644 --- a/lld/test/MachO/silent-ignore.test +++ b/lld/test/MachO/silent-ignore.test @@ -4,6 +4,7 @@ RUN: -dynamic \ RUN: -no_deduplicate \ RUN: -lto_library /lib/foo \ RUN: -macosx_version_min 0 \ +RUN: -dependency_info /path/to/dependency_info.dat \ RUN: -syslibroot /path/to/MacOSX.platform/Developer/SDKs/MacOSX.sdk RUN: not lld -flavor darwinnew -v --not-an-ignored-argument 2>&1 | FileCheck %s CHECK: error: unknown argument: --not-an-ignored-argument diff --git a/lld/test/MachO/weak-definition-direct-fetch.s b/lld/test/MachO/weak-definition-direct-fetch.s new file mode 100644 index 0000000000000..04c022e9c086d --- /dev/null +++ b/lld/test/MachO/weak-definition-direct-fetch.s @@ -0,0 +1,90 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test exercises the various possible combinations of weak and non-weak +## symbols that get referenced directly by a relocation in an object file. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; .section __TEXT,nonweak; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libfoo.dylib %t/foo.o -o %t/libfoo.dylib +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweakfoo.dylib %t/weakfoo.o -o %t/libweakfoo.dylib + +# RUN: llvm-objdump --macho --exports-trie %t/libweakfoo.dylib | FileCheck %s --check-prefix WEAK-DYLIB-CHECK +# WEAK-DYLIB-CHECK: _foo [weak_def] + +## Make sure we are using the export trie and not the symbol table when linking +## against these dylibs. +# RUN: llvm-strip %t/libfoo.dylib +# RUN: llvm-strip %t/libweakfoo.dylib +# RUN: llvm-nm %t/libfoo.dylib 2>&1 | FileCheck %s --check-prefix=NOSYM +# RUN: llvm-nm %t/libweakfoo.dylib 2>&1 | FileCheck %s --check-prefix=NOSYM +# NOSYM: no symbols + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +## End of input file setup. The following lines check which symbol "wins" when +## there are multiple definitions. + +# PREFER-NONWEAK-DYLIB: __DATA __la_symbol_ptr 0x{{[0-9a-f]+}} libfoo _foo +# PREFER-WEAK-OBJECT: O __TEXT,weak _foo +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +## First, we test the cases where the symbols are of the same type (both from a +## dylib, or both from an archive, etc.) +## +## For dylibs and object files, the non-weak symbol always wins. But the weak +## flag has no effect when we are dealing with two archive symbols. 
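A simplified model of that precedence is sketched below; it is not lld's actual resolver, and the Source/Def names are illustrative. The RUN lines that follow exercise each pairing:

    #include <cassert>

    // Simplified model of the precedence these tests encode.
    enum class Source { Dylib, Object, Archive };

    struct Def {
      Source source;
      bool weak;
    };

    // Should `incoming` replace an already-resolved `existing` definition?
    bool shouldReplace(const Def &existing, const Def &incoming) {
      // Between two archive members, the first one fetched wins: the later
      // member is never even loaded, weak or not.
      if (existing.source == Source::Archive && incoming.source == Source::Archive)
        return false;
      // Otherwise a non-weak definition displaces a weak one.
      return existing.weak && !incoming.weak;
    }

    int main() {
      assert(shouldReplace({Source::Dylib, /*weak=*/true}, {Source::Dylib, false}));
      assert(!shouldReplace({Source::Object, false}, {Source::Object, true}));
      assert(!shouldReplace({Source::Archive, true}, {Source::Archive, false}));
      return 0;
    }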
+ +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-dylibs -Z -L%t -lweakfoo -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-dylibs | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-dylibs -Z -L%t -lfoo -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-dylibs | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-objs -Z -L%t %t/weakfoo.o %t/foo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-objs -Z -L%t %t/foo.o %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-archives -Z -L%t %t/weakfoo.a %t/foo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-nonweak-archives | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-archives -Z -L%t %t/foo.a %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-weak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +## The remaining lines test symbol pairs of different types. + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-dylib-weak-ar -Z -L%t -lweakfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-weak-dylib -Z -L%t %t/weakfoo.a -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-weak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-nonweak-dylib -Z -L%t %t/weakfoo.a -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-ar -Z -L%t -lfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-NONWEAK-DYLIB + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-dylib-weak-obj -Z -L%t -lweakfoo %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-weak-dylib -Z -L%t %t/weakfoo.o -lweakfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-weak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-dylib -Z -L%t %t/weakfoo.o -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-obj -Z -L%t -lfoo %t/weakfoo.o %t/test.o +# 
RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-ar -Z -L%t %t/weakfoo.o %t/foo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-ar-weak-obj -Z -L%t %t/foo.a %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-ar-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +.globl _main +_main: + callq _foo + ret diff --git a/lld/test/MachO/weak-definition-indirect-fetch.s b/lld/test/MachO/weak-definition-indirect-fetch.s new file mode 100644 index 0000000000000..d22e0a370d5e0 --- /dev/null +++ b/lld/test/MachO/weak-definition-indirect-fetch.s @@ -0,0 +1,42 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test examines the effect of .weak_definition on symbols in an archive +## that are not referenced directly, but which are still loaded due to some +## other symbol in the archive member being referenced. +## +## In this particular test, _foo isn't referenced directly, but both archives +## will be fetched when linking against the main test file due to its references +## to _bar and _baz. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo, _bar; .section __TEXT,nonweak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo, _baz; .weak_definition _foo; .section __TEXT,weak; _baz: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-archives -Z -L%t %t/weakfoo.a %t/foo.a %t/test.o +# RUN: llvm-objdump --syms %t/weak-nonweak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-archives -Z -L%t %t/foo.a %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-weak-archives | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-nonweak-objs -Z -L%t %t/weakfoo.o %t/foo.o %t/test.o +# RUN: llvm-objdump --syms %t/weak-nonweak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-weak-objs -Z -L%t %t/foo.o %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-weak-objs | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-ar -Z -L%t %t/weakfoo.o %t/foo.a %t/test.o +# RUN: llvm-objdump --syms %t/weak-obj-nonweak-ar | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-ar-weak-obj -Z -L%t %t/foo.a %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --syms %t/nonweak-ar-weak-obj | FileCheck %s --check-prefix=PREFER-NONWEAK-OBJECT + +.globl _main +_main: + callq _bar + callq _baz + ret diff --git a/lld/test/MachO/weak-definition-order.s b/lld/test/MachO/weak-definition-order.s new file
mode 100644 index 0000000000000..6770a5f76b391 --- /dev/null +++ b/lld/test/MachO/weak-definition-order.s @@ -0,0 +1,36 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test demonstrates that when we have two weak symbols of the same type, +## we pick the one whose containing file appears earlier in the command-line +## invocation. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak1; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weak1.o +# RUN: echo ".globl _foo; .weak_definition _foo; .section __TEXT,weak2; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weak2.o + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/obj12 -Z -L%t %t/weak1.o %t/weak2.o %t/test.o +# RUN: llvm-objdump --syms %t/obj12 | FileCheck %s --check-prefix=WEAK1 +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/obj21 -Z -L%t %t/weak2.o %t/weak1.o %t/test.o +# RUN: llvm-objdump --syms %t/obj21 | FileCheck %s --check-prefix=WEAK2 + +# WEAK1: O __TEXT,weak1 _foo +# WEAK2: O __TEXT,weak2 _foo + +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweak1.dylib %t/weak1.o -o %t/libweak1.dylib +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libweak2.dylib %t/weak2.o -o %t/libweak2.dylib + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib12 -Z -L%t -lweak1 -lweak2 %t/test.o +# RUN: llvm-objdump --macho --lazy-bind %t/dylib12 | FileCheck %s --check-prefix=DYLIB1 +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/dylib21 -Z -L%t -lweak2 -lweak1 %t/test.o +# RUN: llvm-objdump --macho --lazy-bind %t/dylib21 | FileCheck %s --check-prefix=DYLIB2 +## TODO: these should really be in the weak binding section, not the lazy binding section +# DYLIB1: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak1 _foo +# DYLIB2: __DATA __la_symbol_ptr 0x{{[0-9a-f]*}} libweak2 _foo + +.globl _main +_main: + callq _foo + ret diff --git a/lld/test/MachO/weak-definition-over-dysym.s b/lld/test/MachO/weak-definition-over-dysym.s new file mode 100644 index 0000000000000..e3cf030b7149f --- /dev/null +++ b/lld/test/MachO/weak-definition-over-dysym.s @@ -0,0 +1,39 @@ +# REQUIRES: x86 +# RUN: mkdir -p %t + +## This test demonstrates that when an archive file is fetched, its symbols +## always override any conflicting dylib symbols, regardless of any weak +## definition flags. 
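A hedged sketch of the rule this test pins down: once an archive member is fetched (here, to resolve _bar), every definition it carries is entered as an object-file definition and displaces any dylib definition of the same name, weak or not. The names and structure below are hypothetical stand-ins, not lld's code.

#include <map>
#include <string>
#include <vector>

struct SymbolTable {
  // true = defined in an object file, false = defined in a dylib.
  std::map<std::string, bool> defs;

  void addDylibDef(const std::string &name) { defs.emplace(name, false); }

  // Fetching a member for *any* of its symbols adds *all* of its
  // definitions as object-file definitions, overriding dylib ones.
  void fetchArchiveMember(const std::vector<std::string> &memberDefs) {
    for (const std::string &name : memberDefs)
      defs[name] = true;
  }
};

int main() {
  SymbolTable symtab;
  symtab.addDylibDef("_foo");                  // provided by -lfoo
  symtab.fetchArchiveMember({"_bar", "_foo"}); // member fetched for _bar
  return symtab.defs["_foo"] ? 0 : 1;          // _foo now binds to the archive
}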
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo; _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/libfoo.o +# RUN: lld -flavor darwinnew -dylib -install_name \ +# RUN: @executable_path/libfoo.dylib %t/libfoo.o -o %t/libfoo.dylib + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: echo ".globl _foo, _bar; .section __TEXT,nonweak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/foo.o +# RUN: echo ".globl _foo, _bar; .weak_definition _foo; .section __TEXT,weak; _bar: _foo:" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t/weakfoo.o + +# RUN: rm -f %t/foo.a +# RUN: llvm-ar --format=darwin rcs %t/foo.a %t/foo.o +# RUN: rm -f %t/weakfoo.a +# RUN: llvm-ar --format=darwin rcs %t/weakfoo.a %t/weakfoo.o + +# PREFER-WEAK-OBJECT: O __TEXT,weak _foo +# PREFER-NONWEAK-OBJECT: O __TEXT,nonweak _foo + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-ar -Z -L%t -lfoo %t/weakfoo.a %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-ar | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-ar-nonweak-dylib -Z -L%t %t/weakfoo.a -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-ar-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/weak-obj-nonweak-dylib -Z -L%t %t/weakfoo.o -lfoo %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/weak-obj-nonweak-dylib | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT +# RUN: lld -flavor darwinnew -L%S/Inputs/MacOSX.sdk/usr/lib -lSystem -o %t/nonweak-dylib-weak-obj -Z -L%t -lfoo %t/weakfoo.o %t/test.o +# RUN: llvm-objdump --macho --lazy-bind --syms %t/nonweak-dylib-weak-obj | FileCheck %s --check-prefix=PREFER-WEAK-OBJECT + +.globl _main +_main: + callq _foo + callq _bar + ret diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 385822c7e1f70..faac3a0be57d0 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -256,3 +256,7 @@ RUN: ld.lld -### -m i386pep foo.o -section-alignment 0x2000 | FileCheck -check-p RUN: ld.lld -### -m i386pep foo.o --section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s RUN: ld.lld -### -m i386pep foo.o -section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s ALIGN: -align:0x2000 + +RUN: ld.lld -### -m i386pe foo.o -no-seh | FileCheck -check-prefix NOSEH %s +RUN: ld.lld -### -m i386pe foo.o --no-seh | FileCheck -check-prefix NOSEH %s +NOSEH: -noseh diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index 0fa9b48c3c792..267f8c5178584 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -39,9 +39,9 @@ llvm_config.use_lld() tool_patterns = [ - 'extract', 'llc', 'llvm-as', 'llvm-mc', 'llvm-nm', 'llvm-objdump', - 'llvm-pdbutil', 'llvm-dwarfdump', 'llvm-readelf', 'llvm-readobj', - 'obj2yaml', 'yaml2obj', 'opt', 'llvm-dis'] + 'llc', 'llvm-as', 'llvm-mc', 'llvm-nm', 'llvm-objdump', 'llvm-pdbutil', + 'llvm-dwarfdump', 'llvm-readelf', 'llvm-readobj', 'obj2yaml', 'yaml2obj', + 'opt', 'llvm-dis'] llvm_config.add_tool_substitutions(tool_patterns) @@ -87,7 +87,7 @@ # Indirectly check if the mt.exe Microsoft utility exists by searching for # cvtres, which always accompanies it. Alternatively, check if we can use # libxml2 to merge manifests. 
-if (lit.util.which('cvtres', config.environment['PATH']) or +if (lit.util.which('cvtres', config.environment['PATH']) or config.llvm_libxml2_enabled): config.available_features.add('manifest_tool') diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt index 88cb85a084017..84d35d43f4e87 100644 --- a/lld/unittests/CMakeLists.txt +++ b/lld/unittests/CMakeLists.txt @@ -12,9 +12,5 @@ function(add_lld_unittest test_dirname) target_link_libraries(${test_dirname} ${LLVM_COMMON_LIBS}) endfunction() -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_subdirectory(DriverTests) add_subdirectory(MachOTests) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 14cfba1f64ff5..df289747cf3a7 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -85,6 +85,16 @@ add_subdirectory(source) add_subdirectory(tools) add_subdirectory(docs) +if (LLDB_ENABLE_PYTHON OR LLDB_ENABLE_LUA) + if(LLDB_BUILD_FRAMEWORK) + set(lldb_python_target_dir "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb") + else() + set(lldb_python_target_dir "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${LLDB_PYTHON_RELATIVE_PATH}/lldb") + endif() + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) + finish_swig("finish_swig" "${lldb_bindings_dir}" "${lldb_python_target_dir}") +endif() + option(LLDB_INCLUDE_TESTS "Generate build targets for the LLDB unit tests." ${LLVM_INCLUDE_TESTS}) if(LLDB_INCLUDE_TESTS) add_subdirectory(test) @@ -92,164 +102,6 @@ if(LLDB_INCLUDE_TESTS) add_subdirectory(utils) endif() -if (LLDB_ENABLE_PYTHON) - get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) - - if(LLDB_BUILD_FRAMEWORK) - set(lldb_python_build_path "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb") - else() - set(lldb_python_build_path "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${LLDB_PYTHON_RELATIVE_PATH}/lldb") - endif() - - # Add a Post-Build Event to copy over Python files and create the symlink - # to liblldb.so for the Python API(hardlink on Windows). - add_custom_target(finish_swig ALL VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_build_path} - DEPENDS ${lldb_bindings_dir}/lldb.py - COMMENT "Python script sym-linking LLDB Python API") - - if(NOT LLDB_USE_SYSTEM_SIX) - add_custom_command(TARGET finish_swig POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E copy - "${LLDB_SOURCE_DIR}/third_party/Python/module/six/six.py" - "${lldb_python_build_path}/../six.py") - endif() - - add_custom_command(TARGET finish_swig POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E copy - "${lldb_bindings_dir}/lldb.py" - "${lldb_python_build_path}/__init__.py") - - function(create_python_package pkg_dir) - cmake_parse_arguments(ARG "NOINIT" "" "FILES" ${ARGN}) - if(ARG_FILES) - set(copy_cmd COMMAND ${CMAKE_COMMAND} -E copy ${ARG_FILES} ${pkg_dir}) - endif() - if(NOT ARG_NOINIT) - set(init_cmd COMMAND ${PYTHON_EXECUTABLE} - "${LLDB_SOURCE_DIR}/bindings/python/createPythonInit.py" - "${pkg_dir}" ${ARG_FILES}) - endif() - add_custom_command(TARGET finish_swig POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${pkg_dir} - ${copy_cmd} - ${init_cmd} - WORKING_DIRECTORY ${lldb_python_build_path}) - endfunction() - - add_custom_command(TARGET finish_swig POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E copy - "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" ${lldb_python_build_path}) - - # Distribute the examples as python packages. 
- create_python_package("formatters/cpp" - FILES "${LLDB_SOURCE_DIR}/examples/synthetic/gnu_libstdcpp.py" - "${LLDB_SOURCE_DIR}/examples/synthetic/libcxx.py") - - create_python_package("formatters" - FILES "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/cache.py" - "${LLDB_SOURCE_DIR}/examples/summaries/synth.py" - "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/metrics.py" - "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/attrib_fromdict.py" - "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/Logger.py") - - create_python_package("utils" - FILES "${LLDB_SOURCE_DIR}/examples/python/in_call_stack.py" - "${LLDB_SOURCE_DIR}/examples/python/symbolication.py") - - if(APPLE) - create_python_package("macosx" - FILES "${LLDB_SOURCE_DIR}/examples/python/crashlog.py" - "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap.py") - - create_python_package("macosx/heap" - FILES "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/heap_find.cpp" - "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/Makefile" - NOINIT) - - create_python_package("diagnose" - FILES "${LLDB_SOURCE_DIR}/examples/python/diagnose_unwind.py" - "${LLDB_SOURCE_DIR}/examples/python/diagnose_nsstring.py") - endif() - - function(create_relative_symlink target dest_file output_dir output_name) - get_filename_component(dest_file ${dest_file} ABSOLUTE) - get_filename_component(output_dir ${output_dir} ABSOLUTE) - file(RELATIVE_PATH rel_dest_file ${output_dir} ${dest_file}) - if(CMAKE_HOST_UNIX) - set(LLVM_LINK_OR_COPY create_symlink) - else() - set(LLVM_LINK_OR_COPY copy) - endif() - add_custom_command(TARGET ${target} POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} ${rel_dest_file} ${output_name} - WORKING_DIRECTORY ${output_dir}) - endfunction() - - if(LLDB_BUILD_FRAMEWORK) - set(LIBLLDB_SYMLINK_DEST "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/LLDB") - else() - set(LIBLLDB_SYMLINK_DEST "${LLVM_SHLIB_OUTPUT_INTDIR}/liblldb${CMAKE_SHARED_LIBRARY_SUFFIX}") - endif() - if(WIN32) - if(CMAKE_BUILD_TYPE STREQUAL Debug) - set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb_d.pyd") - else() - set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb.pyd") - endif() - else() - set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb.so") - endif() - create_relative_symlink(finish_swig ${LIBLLDB_SYMLINK_DEST} - ${lldb_python_build_path} ${LIBLLDB_SYMLINK_OUTPUT_FILE}) - - if(NOT LLDB_BUILD_FRAMEWORK) - set(LLDB_ARGDUMPER_FILENAME "lldb-argdumper${CMAKE_EXECUTABLE_SUFFIX}") - create_relative_symlink(finish_swig "${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLDB_ARGDUMPER_FILENAME}" - ${lldb_python_build_path} ${LLDB_ARGDUMPER_FILENAME}) - endif() - - add_dependencies(finish_swig swig_wrapper liblldb lldb-argdumper) - set_target_properties(finish_swig swig_wrapper PROPERTIES FOLDER "lldb misc") - - # Ensure we do the python post-build step when building lldb. 
- add_dependencies(lldb finish_swig) - - # Install the LLDB python module - if(LLDB_BUILD_FRAMEWORK) - set(LLDB_PYTHON_INSTALL_PATH ${LLDB_FRAMEWORK_INSTALL_DIR}/LLDB.framework/Resources/Python) - else() - set(LLDB_PYTHON_INSTALL_PATH ${LLDB_PYTHON_RELATIVE_PATH}) - endif() - if (NOT CMAKE_CFG_INTDIR STREQUAL ".") - string(REPLACE ${CMAKE_CFG_INTDIR} "\$\{CMAKE_INSTALL_CONFIG_NAME\}" LLDB_PYTHON_INSTALL_PATH ${LLDB_PYTHON_INSTALL_PATH}) - string(REPLACE ${CMAKE_CFG_INTDIR} "\$\{CMAKE_INSTALL_CONFIG_NAME\}" lldb_python_build_path ${lldb_python_build_path}) - endif() - add_custom_target(lldb-python-scripts) - add_dependencies(lldb-python-scripts finish_swig) - install(DIRECTORY ${lldb_python_build_path}/../ - DESTINATION ${LLDB_PYTHON_INSTALL_PATH} - COMPONENT lldb-python-scripts) - if (NOT LLVM_ENABLE_IDE) - add_llvm_install_targets(install-lldb-python-scripts - COMPONENT lldb-python-scripts - DEPENDS lldb-python-scripts) - endif() - - # Add a Post-Build Event to copy the custom Python DLL to the lldb binaries dir so that Windows can find it when launching - # lldb.exe or any other executables that were linked with liblldb. - if (WIN32 AND NOT "${PYTHON_DLL}" STREQUAL "") - # When using the Visual Studio CMake generator the lldb binaries end up in Release/bin, Debug/bin etc. - file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin" LLDB_BIN_DIR) - file(TO_NATIVE_PATH "${PYTHON_DLL}" PYTHON_DLL_NATIVE_PATH) - add_custom_command( - TARGET finish_swig - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_DLL_NATIVE_PATH} ${LLDB_BIN_DIR} VERBATIM - COMMENT "Copying Python DLL to LLDB binaries directory.") - endif () -endif () - if(LLDB_BUILT_STANDALONE AND NOT LLVM_ENABLE_IDE) llvm_distribution_add_targets() endif() diff --git a/lldb/bindings/CMakeLists.txt b/lldb/bindings/CMakeLists.txt index a2e51c263f7e6..3440f605bca5f 100644 --- a/lldb/bindings/CMakeLists.txt +++ b/lldb/bindings/CMakeLists.txt @@ -75,3 +75,171 @@ if (LLDB_ENABLE_LUA) ${CMAKE_CURRENT_BINARY_DIR}/LLDBWrapLua.cpp ) endif() + +function(create_python_package swig_target working_dir pkg_dir) + cmake_parse_arguments(ARG "NOINIT" "" "FILES" ${ARGN}) + if(ARG_FILES) + set(copy_cmd COMMAND ${CMAKE_COMMAND} -E copy ${ARG_FILES} ${pkg_dir}) + endif() + if(NOT ARG_NOINIT) + set(init_cmd COMMAND ${PYTHON_EXECUTABLE} + "${LLDB_SOURCE_DIR}/bindings/python/createPythonInit.py" + "${pkg_dir}" ${ARG_FILES}) + endif() + add_custom_command(TARGET ${swig_target} POST_BUILD VERBATIM + COMMAND ${CMAKE_COMMAND} -E make_directory ${pkg_dir} + ${copy_cmd} + ${init_cmd} + WORKING_DIRECTORY ${working_dir}) +endfunction() + +function(create_relative_symlink swig_target dest_file output_dir output_name) + get_filename_component(dest_file ${dest_file} ABSOLUTE) + get_filename_component(output_dir ${output_dir} ABSOLUTE) + file(RELATIVE_PATH rel_dest_file ${output_dir} ${dest_file}) + if(CMAKE_HOST_UNIX) + set(LLVM_LINK_OR_COPY create_symlink) + else() + set(LLVM_LINK_OR_COPY copy) + endif() + add_custom_command(TARGET ${swig_target} POST_BUILD VERBATIM + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} ${rel_dest_file} ${output_name} + WORKING_DIRECTORY ${output_dir}) +endfunction() + +function(finish_swig swig_target lldb_bindings_dir lldb_python_target_dir) + # Add a Post-Build Event to copy over Python files and create the symlink to + # liblldb.so for the Python API(hardlink on Windows). 
+ add_custom_target(${swig_target} ALL VERBATIM + COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir} + DEPENDS ${lldb_bindings_dir}/lldb.py + COMMENT "Python script sym-linking LLDB Python API") + + if(NOT LLDB_USE_SYSTEM_SIX) + add_custom_command(TARGET ${swig_target} POST_BUILD VERBATIM + COMMAND ${CMAKE_COMMAND} -E copy + "${LLDB_SOURCE_DIR}/third_party/Python/module/six/six.py" + "${lldb_python_target_dir}/../six.py") + endif() + + add_custom_command(TARGET ${swig_target} POST_BUILD VERBATIM + COMMAND ${CMAKE_COMMAND} -E copy + "${lldb_bindings_dir}/lldb.py" + "${lldb_python_target_dir}/__init__.py") + + add_custom_command(TARGET ${swig_target} POST_BUILD VERBATIM + COMMAND ${CMAKE_COMMAND} -E copy + "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" + "${lldb_python_target_dir}") + + # Distribute the examples as python packages. + create_python_package( + ${swig_target} + ${lldb_python_target_dir} + "formatters/cpp" + FILES "${LLDB_SOURCE_DIR}/examples/synthetic/gnu_libstdcpp.py" + "${LLDB_SOURCE_DIR}/examples/synthetic/libcxx.py") + + create_python_package( + ${swig_target} + ${lldb_python_target_dir} + "formatters" + FILES "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/cache.py" + "${LLDB_SOURCE_DIR}/examples/summaries/synth.py" + "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/metrics.py" + "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/attrib_fromdict.py" + "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/Logger.py") + + create_python_package( + ${swig_target} + ${lldb_python_target_dir} + "utils" + FILES "${LLDB_SOURCE_DIR}/examples/python/in_call_stack.py" + "${LLDB_SOURCE_DIR}/examples/python/symbolication.py") + + if(APPLE) + create_python_package( + ${swig_target} + ${lldb_python_target_dir} "macosx" + FILES "${LLDB_SOURCE_DIR}/examples/python/crashlog.py" + "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap.py") + + create_python_package( + ${swig_target} + ${lldb_python_target_dir} "macosx/heap" + FILES "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/heap_find.cpp" + "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/Makefile" + NOINIT) + + create_python_package( + ${swig_target} + ${lldb_python_target_dir} "diagnose" + FILES "${LLDB_SOURCE_DIR}/examples/python/diagnose_unwind.py" + "${LLDB_SOURCE_DIR}/examples/python/diagnose_nsstring.py") + endif() + + if(LLDB_BUILD_FRAMEWORK) + set(LIBLLDB_SYMLINK_DEST "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/LLDB") + else() + set(LIBLLDB_SYMLINK_DEST "${LLVM_SHLIB_OUTPUT_INTDIR}/liblldb${CMAKE_SHARED_LIBRARY_SUFFIX}") + endif() + if(WIN32) + if(CMAKE_BUILD_TYPE STREQUAL Debug) + set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb_d.pyd") + else() + set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb.pyd") + endif() + else() + set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb.so") + endif() + create_relative_symlink(${swig_target} ${LIBLLDB_SYMLINK_DEST} + ${lldb_python_target_dir} ${LIBLLDB_SYMLINK_OUTPUT_FILE}) + + if(NOT LLDB_BUILD_FRAMEWORK) + set(LLDB_ARGDUMPER_FILENAME "lldb-argdumper${CMAKE_EXECUTABLE_SUFFIX}") + create_relative_symlink(${swig_target} "${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLDB_ARGDUMPER_FILENAME}" + ${lldb_python_target_dir} ${LLDB_ARGDUMPER_FILENAME}) + endif() + + add_dependencies(${swig_target} swig_wrapper liblldb lldb-argdumper) + set_target_properties(${swig_target} swig_wrapper PROPERTIES FOLDER "lldb misc") + + # Ensure we do the python post-build step when building lldb. 
+ add_dependencies(lldb ${swig_target}) + + # Install the LLDB python module + if(LLDB_BUILD_FRAMEWORK) + set(LLDB_PYTHON_INSTALL_PATH ${LLDB_FRAMEWORK_INSTALL_DIR}/LLDB.framework/Resources/Python) + else() + set(LLDB_PYTHON_INSTALL_PATH ${LLDB_PYTHON_RELATIVE_PATH}) + endif() + if (NOT CMAKE_CFG_INTDIR STREQUAL ".") + string(REPLACE ${CMAKE_CFG_INTDIR} "\$\{CMAKE_INSTALL_CONFIG_NAME\}" LLDB_PYTHON_INSTALL_PATH ${LLDB_PYTHON_INSTALL_PATH}) + string(REPLACE ${CMAKE_CFG_INTDIR} "\$\{CMAKE_INSTALL_CONFIG_NAME\}" lldb_python_target_dir ${lldb_python_target_dir}) + endif() + set(swig_scripts_target "${swig_target}_scripts") + set(swig_scripts_install_target "${swig_target}_scripts_install") + add_custom_target(${swig_scripts_target}) + add_dependencies(${swig_scripts_target} ${swig_target}) + install(DIRECTORY ${lldb_python_target_dir}/../ + DESTINATION ${LLDB_PYTHON_INSTALL_PATH} + COMPONENT ${swig_scripts_target}) + if (NOT LLVM_ENABLE_IDE) + add_llvm_install_targets(${swig_scripts_install_target} + COMPONENT ${swig_scripts_target} + DEPENDS ${swig_scripts_target}) + endif() + + # Add a Post-Build Event to copy the custom Python DLL to the lldb binaries dir so that Windows can find it when launching + # lldb.exe or any other executables that were linked with liblldb. + if (WIN32 AND NOT "${PYTHON_DLL}" STREQUAL "") + # When using the Visual Studio CMake generator the lldb binaries end up in Release/bin, Debug/bin etc. + file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin" LLDB_BIN_DIR) + file(TO_NATIVE_PATH "${PYTHON_DLL}" PYTHON_DLL_NATIVE_PATH) + add_custom_command( + TARGET ${swig_target} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_DLL_NATIVE_PATH} ${LLDB_BIN_DIR} VERBATIM + COMMENT "Copying Python DLL to LLDB binaries directory.") + endif() +endfunction() diff --git a/lldb/cmake/modules/FindPythonInterpAndLibs.cmake b/lldb/cmake/modules/FindPythonInterpAndLibs.cmake index 243e0463f48b6..3a64ebbcf9721 100644 --- a/lldb/cmake/modules/FindPythonInterpAndLibs.cmake +++ b/lldb/cmake/modules/FindPythonInterpAndLibs.cmake @@ -61,46 +61,22 @@ if(PYTHON_LIBRARIES AND PYTHON_INCLUDE_DIRS AND PYTHON_EXECUTABLE AND SWIG_EXECU else() find_package(SWIG 2.0) if (SWIG_FOUND) - if(NOT CMAKE_VERSION VERSION_LESS 3.12) - if (LLDB_PYTHON_VERSION) - if (LLDB_PYTHON_VERSION VERSION_EQUAL "2") - FindPython2() - elseif(LLDB_PYTHON_VERSION VERSION_EQUAL "3") - FindPython3() - endif() - else() + if (LLDB_PYTHON_VERSION) + if (LLDB_PYTHON_VERSION VERSION_EQUAL "2") + FindPython2() + elseif(LLDB_PYTHON_VERSION VERSION_EQUAL "3") FindPython3() - if (NOT PYTHON3_FOUND AND NOT CMAKE_SYSTEM_NAME STREQUAL Windows) - FindPython2() - endif() endif() else() - find_package(PythonInterp) - find_package(PythonLibs) - if(PYTHONINTERP_FOUND AND PYTHONLIBS_FOUND AND SWIG_FOUND) - if (NOT CMAKE_CROSSCOMPILING) - string(REPLACE "." ";" pythonlibs_version_list ${PYTHONLIBS_VERSION_STRING}) - list(GET pythonlibs_version_list 0 pythonlibs_major) - list(GET pythonlibs_version_list 1 pythonlibs_minor) - - # Ignore the patch version. Some versions of macOS report a different - # patch version for the system provided interpreter and libraries. 
- if (CMAKE_CROSSCOMPILING OR (PYTHON_VERSION_MAJOR VERSION_EQUAL pythonlibs_major AND - PYTHON_VERSION_MINOR VERSION_EQUAL pythonlibs_minor)) - mark_as_advanced( - PYTHON_LIBRARIES - PYTHON_INCLUDE_DIRS - PYTHON_EXECUTABLE - SWIG_EXECUTABLE) - endif() - endif() + FindPython3() + if (NOT PYTHON3_FOUND AND NOT CMAKE_SYSTEM_NAME STREQUAL Windows) + FindPython2() endif() endif() else() message(STATUS "SWIG 2 or later is required for Python support in LLDB but could not be found") endif() - include(FindPackageHandleStandardArgs) find_package_handle_standard_args(PythonInterpAndLibs FOUND_VAR diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index 8465cfe3b7b72..7e5848c800f87 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -79,11 +79,6 @@ if(LLDB_BUILD_FRAMEWORK) if(NOT APPLE) message(FATAL_ERROR "LLDB.framework can only be generated when targeting Apple platforms") endif() - # CMake 3.6 did not correctly emit POST_BUILD commands for Apple Framework targets - # CMake < 3.8 did not have the BUILD_RPATH target property - if(CMAKE_VERSION VERSION_LESS 3.8) - message(FATAL_ERROR "LLDB_BUILD_FRAMEWORK is not supported on CMake < 3.8") - endif() set(LLDB_FRAMEWORK_VERSION A CACHE STRING "LLDB.framework version (default is A)") set(LLDB_FRAMEWORK_BUILD_DIR bin CACHE STRING "Output directory for LLDB.framework") diff --git a/lldb/cmake/modules/LLDBStandalone.cmake b/lldb/cmake/modules/LLDBStandalone.cmake index 752113bcc6c41..94781c3583744 100644 --- a/lldb/cmake/modules/LLDBStandalone.cmake +++ b/lldb/cmake/modules/LLDBStandalone.cmake @@ -73,6 +73,7 @@ endif() # We append the directory in which LLVMConfig.cmake lives. We expect LLVM's # CMake modules to be in that directory as well. list(APPEND CMAKE_MODULE_PATH "${LLVM_DIR}") + include(AddLLVM) include(TableGen) include(HandleLLVMOptions) diff --git a/lldb/docs/resources/bots.rst b/lldb/docs/resources/bots.rst index efe8c7116b01e..d9ddcde41abcc 100644 --- a/lldb/docs/resources/bots.rst +++ b/lldb/docs/resources/bots.rst @@ -10,16 +10,9 @@ LLVM Buildbot is the place where volunteers provide build machines. Everyone can * `lldb-x64-windows-ninja `_ * `lldb-x86_64-debian `_ * `lldb-aarch64-ubuntu `_ +* `lldb-arm-ubuntu `_ * `lldb-x86_64-fedora `_ -Documentation -------------- - -The documentation bot validates that the website builds correctly with Sphinx. -It does not generate the website itself, which happens on a separate server. - -* `lldb-sphinx-docs `_ - GreenDragon ----------- @@ -28,7 +21,14 @@ GreenDragon builds and tests LLDB on macOS. It has a `dedicated tab * `lldb-cmake `_ * `lldb-cmake-matrix `_ -* `lldb-cmake-python3 `_ +* `lldb-cmake-reproducers `_ * `lldb-cmake-standalone `_ * `lldb-cmake-sanitized `_ +Documentation +------------- + +The documentation bot validates that the website builds correctly with Sphinx. +It does not generate the website itself, which happens on a separate server. 
+ +* `lldb-sphinx-docs `_ diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index 83c38d3b61664..f7390cfabf01c 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -17,6 +17,7 @@ #include "lldb/API/SBBreakpointName.h" #include "lldb/API/SBBroadcaster.h" #include "lldb/API/SBCommandInterpreter.h" +#include "lldb/API/SBCommandInterpreterRunOptions.h" #include "lldb/API/SBCommandReturnObject.h" #include "lldb/API/SBCommunication.h" #include "lldb/API/SBCompileUnit.h" diff --git a/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h b/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h index 82d6feedc02e7..3c00513faaa71 100644 --- a/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h +++ b/lldb/include/lldb/API/SBCommandInterpreterRunOptions.h @@ -26,8 +26,12 @@ class LLDB_API SBCommandInterpreterRunOptions { public: SBCommandInterpreterRunOptions(); + SBCommandInterpreterRunOptions(const SBCommandInterpreterRunOptions &rhs); ~SBCommandInterpreterRunOptions(); + SBCommandInterpreterRunOptions & + operator=(const SBCommandInterpreterRunOptions &rhs); + bool GetStopOnContinue() const; void SetStopOnContinue(bool); diff --git a/lldb/include/lldb/Breakpoint/Breakpoint.h b/lldb/include/lldb/Breakpoint/Breakpoint.h index d29d21070fd7c..5d3e596c474be 100644 --- a/lldb/include/lldb/Breakpoint/Breakpoint.h +++ b/lldb/include/lldb/Breakpoint/Breakpoint.h @@ -20,6 +20,7 @@ #include "lldb/Breakpoint/BreakpointName.h" #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Breakpoint/Stoppoint.h" +#include "lldb/Breakpoint/StoppointHitCounter.h" #include "lldb/Core/SearchFilter.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/StringList.h" @@ -624,13 +625,6 @@ class Breakpoint : public std::enable_shared_from_this, bool IgnoreCountShouldStop(); - void IncrementHitCount() { m_hit_count++; } - - void DecrementHitCount() { - assert(m_hit_count > 0); - m_hit_count--; - } - private: // To call from CopyFromBreakpoint. Breakpoint(Target &new_target, const Breakpoint &bp_to_copy_from); @@ -660,10 +654,12 @@ class Breakpoint : public std::enable_shared_from_this, m_locations; // The list of locations currently found for this breakpoint. std::string m_kind_description; bool m_resolve_indirect_symbols; - uint32_t m_hit_count; // Number of times this breakpoint/watchpoint has been - // hit. This is kept - // separately from the locations hit counts, since locations can go away when - // their backing library gets unloaded, and we would lose hit counts. + + /// Number of times this breakpoint has been hit. This is kept separately + /// from the locations hit counts, since locations can go away when their + /// backing library gets unloaded, and we would lose hit counts. + StoppointHitCounter m_hit_counter; + BreakpointName::Permissions m_permissions; void SendBreakpointChangedEvent(lldb::BreakpointEventType eventKind); diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocation.h b/lldb/include/lldb/Breakpoint/BreakpointLocation.h index 3fc571eaa292b..4e1c57a404352 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocation.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocation.h @@ -13,7 +13,7 @@ #include #include "lldb/Breakpoint/BreakpointOptions.h" -#include "lldb/Breakpoint/StoppointLocation.h" +#include "lldb/Breakpoint/StoppointHitCounter.h" #include "lldb/Core/Address.h" #include "lldb/Utility/UserID.h" #include "lldb/lldb-private.h" @@ -35,15 +35,14 @@ namespace lldb_private { /// be useful if you've set options on the locations. 
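The SBCommandInterpreterRunOptions hunk above only declares the new copy constructor and copy assignment. For an SB class that owns an opaque implementation pointer, the usual rule-of-three shape is a deep copy of the pointee; the sketch below uses hypothetical stand-in types (Impl, RunOptions), not LLDB's actual internals.

#include <memory>

struct Impl {
  bool stop_on_continue = false;
  bool echo_commands = true;
};

class RunOptions {
public:
  RunOptions() : m_opaque_up(new Impl()) {}
  // Copy constructor: clone the pointee, never share it.
  RunOptions(const RunOptions &rhs) : m_opaque_up(new Impl(*rhs.m_opaque_up)) {}
  // Copy assignment: deep-copy into the existing pointee.
  RunOptions &operator=(const RunOptions &rhs) {
    if (this != &rhs)
      *m_opaque_up = *rhs.m_opaque_up;
    return *this;
  }

private:
  std::unique_ptr<Impl> m_opaque_up; // unique ownership forbids default copies
};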
class BreakpointLocation - : public std::enable_shared_from_this, - public StoppointLocation { + : public std::enable_shared_from_this { public: - ~BreakpointLocation() override; + ~BreakpointLocation(); /// Gets the load address for this breakpoint location \return /// Returns breakpoint location load address, \b /// LLDB_INVALID_ADDRESS if not yet set. - lldb::addr_t GetLoadAddress() const override; + lldb::addr_t GetLoadAddress() const; /// Gets the Address for this breakpoint location \return /// Returns breakpoint location Address. @@ -63,7 +62,7 @@ class BreakpointLocation /// \return /// \b true if this breakpoint location thinks we should stop, /// \b false otherwise. - bool ShouldStop(StoppointCallbackContext *context) override; + bool ShouldStop(StoppointCallbackContext *context); // The next section deals with various breakpoint options. @@ -85,11 +84,14 @@ class BreakpointLocation /// \b true if the breakpoint is set to auto-continue, \b false if not. bool IsAutoContinue() const; + /// Return the current Hit Count. + uint32_t GetHitCount() const { return m_hit_counter.GetValue(); } + /// Return the current Ignore Count. /// /// \return /// The number of breakpoint hits to be ignored. - uint32_t GetIgnoreCount(); + uint32_t GetIgnoreCount() const; /// Set the breakpoint to ignore the next \a count breakpoint hits. /// @@ -192,7 +194,7 @@ class BreakpointLocation void GetDescription(Stream *s, lldb::DescriptionLevel level); /// Standard "Dump" method. At present it does nothing. - void Dump(Stream *s) const override; + void Dump(Stream *s) const; /// Use this to set location specific breakpoint options. /// @@ -268,6 +270,9 @@ class BreakpointLocation /// \b true or \b false as given in the description above. bool EquivalentToLocation(BreakpointLocation &location); + /// Returns the breakpoint location ID. + lldb::break_id_t GetID() const { return m_loc_id; } + protected: friend class BreakpointSite; friend class BreakpointLocationList; @@ -338,6 +343,9 @@ class BreakpointLocation /// multiple processes. size_t m_condition_hash; ///< For testing whether the condition source code ///changed. + lldb::break_id_t m_loc_id; ///< Breakpoint location ID. + StoppointHitCounter m_hit_counter; ///< Number of times this breakpoint + /// location has been hit. void SetShouldResolveIndirectFunctions(bool do_resolve) { m_should_resolve_indirect_functions = do_resolve; diff --git a/lldb/include/lldb/Breakpoint/BreakpointSite.h b/lldb/include/lldb/Breakpoint/BreakpointSite.h index 5ce17f511db44..fc32c04ffe359 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointSite.h +++ b/lldb/include/lldb/Breakpoint/BreakpointSite.h @@ -14,7 +14,8 @@ #include "lldb/Breakpoint/BreakpointLocationCollection.h" -#include "lldb/Breakpoint/StoppointLocation.h" +#include "lldb/Breakpoint/StoppointSite.h" +#include "lldb/Utility/LLDBAssert.h" #include "lldb/Utility/UserID.h" #include "lldb/lldb-forward.h" @@ -32,7 +33,7 @@ namespace lldb_private { /// by the process. 
class BreakpointSite : public std::enable_shared_from_this<BreakpointSite>, - public StoppointLocation { + public StoppointSite { public: enum Type { eSoftware, // Breakpoint opcode has been written to memory and @@ -60,8 +61,6 @@ class BreakpointSite : public std::enable_shared_from_this<BreakpointSite>, /// Sets the trap opcode bool SetTrapOpcode(const uint8_t *trap_opcode, uint32_t trap_opcode_size); - void SetHardwareIndex(uint32_t index) override; - /// Gets the original instruction bytes that were overwritten by the trap uint8_t *GetSavedOpcodeBytes(); @@ -184,6 +183,12 @@ class BreakpointSite : public std::enable_shared_from_this<BreakpointSite>, /// \b false otherwise. bool IsInternal() const; + bool IsHardware() const override { + lldbassert(BreakpointSite::Type::eHardware == GetType() || + !HardwareRequired()); + return BreakpointSite::Type::eHardware == GetType(); + } + BreakpointSite::Type GetType() const { return m_type; } void SetType(BreakpointSite::Type type) { m_type = type; } diff --git a/lldb/include/lldb/Breakpoint/StoppointHitCounter.h b/lldb/include/lldb/Breakpoint/StoppointHitCounter.h new file mode 100644 index 0000000000000..26f816da6430d --- /dev/null +++ b/lldb/include/lldb/Breakpoint/StoppointHitCounter.h @@ -0,0 +1,43 @@ +//===-- StoppointHitCounter.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_BREAKPOINT_STOPPOINT_HIT_COUNTER_H +#define LLDB_BREAKPOINT_STOPPOINT_HIT_COUNTER_H + +#include <assert.h> +#include <stdint.h> +#include <limits> + +#include "lldb/Utility/LLDBAssert.h" + +namespace lldb_private { + +class StoppointHitCounter { +public: + uint32_t GetValue() const { return m_hit_count; } + + void Increment(uint32_t difference = 1) { + lldbassert(std::numeric_limits<uint32_t>::max() - m_hit_count >= difference); + m_hit_count += difference; + } + + void Decrement(uint32_t difference = 1) { + lldbassert(m_hit_count >= difference); + m_hit_count -= difference; + } + + void Reset() { m_hit_count = 0; } + +private: + /// Number of times this breakpoint/watchpoint has been hit. + uint32_t m_hit_count = 0; +}; + +} // namespace lldb_private + +#endif // LLDB_BREAKPOINT_STOPPOINT_HIT_COUNTER_H diff --git a/lldb/include/lldb/Breakpoint/StoppointLocation.h b/lldb/include/lldb/Breakpoint/StoppointLocation.h deleted file mode 100644 index 4d6ca044ccc43..0000000000000 --- a/lldb/include/lldb/Breakpoint/StoppointLocation.h +++ /dev/null @@ -1,87 +0,0 @@ -//===-- StoppointLocation.h -------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
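StoppointHitCounter replaces the raw uint32_t counters (and the hand-written bounds checks in Breakpoint::IncrementHitCount/DecrementHitCount) with one checked type. Below is a self-contained sketch of the same shape, substituting plain assert for lldbassert so it compiles standalone; the usage in main() is illustrative, not taken from LLDB.

#include <cassert>
#include <cstdint>
#include <limits>

class StoppointHitCounter {
public:
  uint32_t GetValue() const { return m_hit_count; }
  void Increment(uint32_t difference = 1) {
    // Overflow check that previously lived at each call site.
    assert(std::numeric_limits<uint32_t>::max() - m_hit_count >= difference);
    m_hit_count += difference;
  }
  void Decrement(uint32_t difference = 1) {
    assert(m_hit_count >= difference); // underflow check
    m_hit_count -= difference;
  }
  void Reset() { m_hit_count = 0; }

private:
  uint32_t m_hit_count = 0;
};

int main() {
  StoppointHitCounter counter;
  counter.Increment(); // a stop point was hit
  counter.Decrement(); // undo a hit that should not count
  counter.Reset();     // e.g. when a watchpoint's counts are cleared
  return static_cast<int>(counter.GetValue());
}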
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLDB_BREAKPOINT_STOPPOINTLOCATION_H -#define LLDB_BREAKPOINT_STOPPOINTLOCATION_H - -#include "lldb/Utility/UserID.h" -#include "lldb/lldb-private.h" -// #include "lldb/Breakpoint/BreakpointOptions.h" - -namespace lldb_private { - -class StoppointLocation { -public: - // Constructors and Destructors - StoppointLocation(lldb::break_id_t bid, lldb::addr_t m_addr, bool hardware); - - StoppointLocation(lldb::break_id_t bid, lldb::addr_t m_addr, - uint32_t byte_size, bool hardware); - - virtual ~StoppointLocation(); - - // Operators - - // Methods - virtual lldb::addr_t GetLoadAddress() const { return m_addr; } - - virtual void SetLoadAddress(lldb::addr_t addr) { m_addr = addr; } - - uint32_t GetByteSize() const { return m_byte_size; } - - uint32_t GetHitCount() const { return m_hit_count; } - - uint32_t GetHardwareIndex() const { return m_hardware_index; } - - bool HardwareRequired() const { return m_hardware; } - - virtual bool IsHardware() const { - return m_hardware_index != LLDB_INVALID_INDEX32; - } - - virtual bool ShouldStop(StoppointCallbackContext *context) { return true; } - - virtual void Dump(Stream *stream) const {} - - virtual void SetHardwareIndex(uint32_t index) { m_hardware_index = index; } - - lldb::break_id_t GetID() const { return m_loc_id; } - -protected: - // Classes that inherit from StoppointLocation can see and modify these - lldb::break_id_t m_loc_id; // Stoppoint location ID - lldb::addr_t - m_addr; // The load address of this stop point. The base Stoppoint doesn't - // store a full Address since that's not needed for the breakpoint sites. - bool m_hardware; // True if this point has been is required to use hardware - // (which may fail due to lack of resources) - uint32_t m_hardware_index; // The hardware resource index for this - // breakpoint/watchpoint - uint32_t m_byte_size; // The size in bytes of stop location. e.g. the length - // of the trap opcode for - // software breakpoints, or the optional length in bytes for hardware - // breakpoints, or the length of the watchpoint. - uint32_t - m_hit_count; // Number of times this breakpoint/watchpoint has been hit - - // If you override this, be sure to call the base class to increment the - // internal counter. - void IncrementHitCount() { ++m_hit_count; } - - void DecrementHitCount(); - -private: - // For StoppointLocation only - StoppointLocation(const StoppointLocation &) = delete; - const StoppointLocation &operator=(const StoppointLocation &) = delete; - StoppointLocation() = delete; -}; - -} // namespace lldb_private - -#endif // LLDB_BREAKPOINT_STOPPOINTLOCATION_H diff --git a/lldb/include/lldb/Breakpoint/StoppointSite.h b/lldb/include/lldb/Breakpoint/StoppointSite.h new file mode 100644 index 0000000000000..7e5e33486345d --- /dev/null +++ b/lldb/include/lldb/Breakpoint/StoppointSite.h @@ -0,0 +1,81 @@ +//===-- StoppointSite.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_BREAKPOINT_STOPPOINTSITE_H +#define LLDB_BREAKPOINT_STOPPOINTSITE_H + +#include "lldb/Breakpoint/StoppointHitCounter.h" +#include "lldb/Utility/UserID.h" +#include "lldb/lldb-private.h" + +namespace lldb_private { + +class StoppointSite { +public: + StoppointSite(lldb::break_id_t bid, lldb::addr_t m_addr, bool hardware); + + StoppointSite(lldb::break_id_t bid, lldb::addr_t m_addr, + uint32_t byte_size, bool hardware); + + virtual ~StoppointSite() = default; + + virtual lldb::addr_t GetLoadAddress() const { return m_addr; } + + virtual void SetLoadAddress(lldb::addr_t addr) { m_addr = addr; } + + uint32_t GetByteSize() const { return m_byte_size; } + + uint32_t GetHitCount() const { return m_hit_counter.GetValue(); } + + void ResetHitCount() { m_hit_counter.Reset(); } + + bool HardwareRequired() const { return m_is_hardware_required; } + + virtual bool IsHardware() const = 0; + + uint32_t GetHardwareIndex() const { return m_hardware_index; } + + void SetHardwareIndex(uint32_t index) { m_hardware_index = index; } + + virtual bool ShouldStop(StoppointCallbackContext* context) = 0; + + virtual void Dump(Stream* stream) const = 0; + + lldb::break_id_t GetID() const { return m_id; } + +protected: + /// Stoppoint site ID. + lldb::break_id_t m_id; + + /// The load address of this stop point. + lldb::addr_t m_addr; + + /// True if this point is required to use hardware (which may fail due to + /// the lack of resources). + bool m_is_hardware_required; + + /// The hardware resource index for this breakpoint/watchpoint. + uint32_t m_hardware_index; + + /// The size in bytes of stoppoint, e.g. the length of the trap opcode for + /// software breakpoints, or the optional length in bytes for hardware + /// breakpoints, or the length of the watchpoint. + uint32_t m_byte_size; + + /// Number of times this breakpoint/watchpoint has been hit. + StoppointHitCounter m_hit_counter; + +private: + StoppointSite(const StoppointSite &) = delete; + const StoppointSite &operator=(const StoppointSite &) = delete; + StoppointSite() = delete; +}; + +} // namespace lldb_private + +#endif // LLDB_BREAKPOINT_STOPPOINTSITE_H diff --git a/lldb/include/lldb/Breakpoint/Watchpoint.h b/lldb/include/lldb/Breakpoint/Watchpoint.h index bce15f0a85dde..41b723a66b6a3 100644 --- a/lldb/include/lldb/Breakpoint/Watchpoint.h +++ b/lldb/include/lldb/Breakpoint/Watchpoint.h @@ -12,7 +12,7 @@ #include #include -#include "lldb/Breakpoint/StoppointLocation.h" +#include "lldb/Breakpoint/StoppointSite.h" #include "lldb/Breakpoint/WatchpointOptions.h" #include "lldb/Symbol/CompilerType.h" #include "lldb/Target/Target.h" @@ -22,7 +22,7 @@ namespace lldb_private { class Watchpoint : public std::enable_shared_from_this, - public StoppointLocation { + public StoppointSite { public: class WatchpointEventData : public EventData { public: @@ -158,8 +158,6 @@ class Watchpoint : public std::enable_shared_from_this, friend class Target; friend class WatchpointList; - void ResetHitCount() { m_hit_count = 0; } - void ResetHistoricValues() { m_old_value_sp.reset(); m_new_value_sp.reset(); @@ -199,7 +197,7 @@ class Watchpoint : public std::enable_shared_from_this, std::unique_ptr m_condition_up; // The condition to test. 
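The net effect of this split: StoppointSite is an abstract base whose IsHardware is pure virtual, and BreakpointSite now derives hardware-ness from its Type rather than from whether a hardware index was assigned. A minimal stand-in sketch of that shape (simplified types, not the real headers):

#include <cassert>

struct StoppointSite {
  virtual ~StoppointSite() = default;
  virtual bool IsHardware() const = 0; // was index-based, now per-subclass
  bool HardwareRequired() const { return m_is_hardware_required; }
  bool m_is_hardware_required = false;
};

struct BreakpointSite final : StoppointSite {
  enum Type { eSoftware, eHardware };
  Type m_type = eSoftware;
  // Hardware-ness is a property of the site's type; if hardware was
  // required but the type never became eHardware, earlier setup failed.
  bool IsHardware() const override {
    assert(m_type == eHardware || !HardwareRequired());
    return m_type == eHardware;
  }
};

int main() {
  BreakpointSite site;
  site.m_type = BreakpointSite::eHardware;
  return site.IsHardware() ? 0 : 1;
}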
- void SetID(lldb::watch_id_t id) { m_loc_id = id; } + void SetID(lldb::watch_id_t id) { m_id = id; } void SendWatchpointChangedEvent(lldb::WatchpointEventType eventKind); diff --git a/lldb/include/lldb/Core/Communication.h b/lldb/include/lldb/Core/Communication.h index 6b65974f95225..354c4bbcc283c 100644 --- a/lldb/include/lldb/Core/Communication.h +++ b/lldb/include/lldb/Core/Communication.h @@ -285,7 +285,7 @@ class Communication : public Broadcaster { /// void SynchronizeWithReadThread(); - static const char *ConnectionStatusAsCString(lldb::ConnectionStatus status); + static std::string ConnectionStatusAsString(lldb::ConnectionStatus status); bool GetCloseOnEOF() const { return m_close_on_eof; } diff --git a/lldb/include/lldb/Core/ValueObject.h b/lldb/include/lldb/Core/ValueObject.h index 0080368fd9965..a557d69f3ae30 100644 --- a/lldb/include/lldb/Core/ValueObject.h +++ b/lldb/include/lldb/Core/ValueObject.h @@ -358,7 +358,7 @@ class ValueObject : public UserID { virtual bool CanProvideValue(); // Subclasses must implement the functions below. - virtual uint64_t GetByteSize() = 0; + virtual llvm::Optional GetByteSize() = 0; virtual lldb::ValueType GetValueType() const = 0; diff --git a/lldb/include/lldb/Core/ValueObjectCast.h b/lldb/include/lldb/Core/ValueObjectCast.h index d91ca6a92be8d..342803f8ca63a 100644 --- a/lldb/include/lldb/Core/ValueObjectCast.h +++ b/lldb/include/lldb/Core/ValueObjectCast.h @@ -30,7 +30,7 @@ class ValueObjectCast : public ValueObject { ConstString name, const CompilerType &cast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; size_t CalculateNumChildren(uint32_t max) override; diff --git a/lldb/include/lldb/Core/ValueObjectChild.h b/lldb/include/lldb/Core/ValueObjectChild.h index c6f44a29b0591..9a9fd9294261a 100644 --- a/lldb/include/lldb/Core/ValueObjectChild.h +++ b/lldb/include/lldb/Core/ValueObjectChild.h @@ -30,7 +30,7 @@ class ValueObjectChild : public ValueObject { public: ~ValueObjectChild() override; - uint64_t GetByteSize() override { return m_byte_size; } + llvm::Optional GetByteSize() override { return m_byte_size; } lldb::offset_t GetByteOffset() override { return m_byte_offset; } diff --git a/lldb/include/lldb/Core/ValueObjectConstResult.h b/lldb/include/lldb/Core/ValueObjectConstResult.h index 0e868c687e931..8d823baa0b7b4 100644 --- a/lldb/include/lldb/Core/ValueObjectConstResult.h +++ b/lldb/include/lldb/Core/ValueObjectConstResult.h @@ -62,7 +62,7 @@ class ValueObjectConstResult : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const Status &error); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override; @@ -113,7 +113,7 @@ class ValueObjectConstResult : public ValueObject { CompilerType GetCompilerTypeImpl() override; ConstString m_type_name; - uint64_t m_byte_size; + llvm::Optional m_byte_size; ValueObjectConstResultImpl m_impl; diff --git a/lldb/include/lldb/Core/ValueObjectDynamicValue.h b/lldb/include/lldb/Core/ValueObjectDynamicValue.h index 9f5304b55e934..2806857339efb 100644 --- a/lldb/include/lldb/Core/ValueObjectDynamicValue.h +++ b/lldb/include/lldb/Core/ValueObjectDynamicValue.h @@ -34,7 +34,7 @@ class ValueObjectDynamicValue : public ValueObject { public: ~ValueObjectDynamicValue() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectMemory.h 
b/lldb/include/lldb/Core/ValueObjectMemory.h index d1cd6ae41445d..b5d5e6ecf4c0e 100644 --- a/lldb/include/lldb/Core/ValueObjectMemory.h +++ b/lldb/include/lldb/Core/ValueObjectMemory.h @@ -40,7 +40,7 @@ class ValueObjectMemory : public ValueObject { const Address &address, const CompilerType &ast_type); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectRegister.h b/lldb/include/lldb/Core/ValueObjectRegister.h index 41051d93b707e..3968584ad5185 100644 --- a/lldb/include/lldb/Core/ValueObjectRegister.h +++ b/lldb/include/lldb/Core/ValueObjectRegister.h @@ -36,7 +36,7 @@ class ValueObjectRegisterSet : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t set_idx); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegisterSet; @@ -86,7 +86,7 @@ class ValueObjectRegister : public ValueObject { lldb::RegisterContextSP ®_ctx_sp, uint32_t reg_num); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; lldb::ValueType GetValueType() const override { return lldb::eValueTypeRegister; diff --git a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h index cb471657aec9b..41c461ce13f0d 100644 --- a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h +++ b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h @@ -36,7 +36,7 @@ class ValueObjectSynthetic : public ValueObject { public: ~ValueObjectSynthetic() override; - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Core/ValueObjectVariable.h b/lldb/include/lldb/Core/ValueObjectVariable.h index b7e262574a14d..23fdedbf5a4a6 100644 --- a/lldb/include/lldb/Core/ValueObjectVariable.h +++ b/lldb/include/lldb/Core/ValueObjectVariable.h @@ -37,7 +37,7 @@ class ValueObjectVariable : public ValueObject { static lldb::ValueObjectSP Create(ExecutionContextScope *exe_scope, const lldb::VariableSP &var_sp); - uint64_t GetByteSize() override; + llvm::Optional GetByteSize() override; ConstString GetTypeName() override; diff --git a/lldb/include/lldb/Expression/ExpressionVariable.h b/lldb/include/lldb/Expression/ExpressionVariable.h index 60062d212badf..4259e6395da47 100644 --- a/lldb/include/lldb/Expression/ExpressionVariable.h +++ b/lldb/include/lldb/Expression/ExpressionVariable.h @@ -32,7 +32,7 @@ class ExpressionVariable virtual ~ExpressionVariable(); - size_t GetByteSize() { return m_frozen_sp->GetByteSize(); } + llvm::Optional GetByteSize() { return m_frozen_sp->GetByteSize(); } ConstString GetName() { return m_frozen_sp->GetName(); } diff --git a/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h b/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h index 01d7c6d118d46..cbd50511c483c 100644 --- a/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h +++ b/lldb/include/lldb/Interpreter/CommandObjectRegexCommand.h @@ -30,7 +30,7 @@ class CommandObjectRegexCommand : public CommandObjectRaw { bool IsRemovable() const override { return m_is_removable; } - bool AddRegexCommand(const char *re_cstr, const char *command_cstr); + bool AddRegexCommand(llvm::StringRef re_cstr, llvm::StringRef command_cstr); bool HasRegexEntries() const { return !m_entries.empty(); } diff --git a/lldb/include/lldb/Target/LanguageRuntime.h b/lldb/include/lldb/Target/LanguageRuntime.h index 
b0b9b919911a1..da3cb9702392d 100644 --- a/lldb/include/lldb/Target/LanguageRuntime.h +++ b/lldb/include/lldb/Target/LanguageRuntime.h @@ -18,6 +18,7 @@ #include "lldb/Expression/LLVMUserExpression.h" #include "lldb/Symbol/DeclVendor.h" #include "lldb/Target/ExecutionContextScope.h" +#include "lldb/Target/Runtime.h" #include "lldb/lldb-private.h" #include "lldb/lldb-public.h" @@ -56,7 +57,7 @@ class ExceptionSearchFilter : public SearchFilter { void UpdateModuleListIfNeeded(); }; -class LanguageRuntime : public PluginInterface { +class LanguageRuntime : public Runtime, public PluginInterface { public: ~LanguageRuntime() override; @@ -127,10 +128,6 @@ class LanguageRuntime : public PluginInterface { return lldb::ThreadSP(); } - Process *GetProcess() { return m_process; } - - Target &GetTargetRef() { return m_process->GetTarget(); } - virtual DeclVendor *GetDeclVendor() { return nullptr; } virtual lldb::BreakpointResolverSP @@ -159,7 +156,7 @@ class LanguageRuntime : public PluginInterface { return llvm::None; } - virtual void ModulesDidLoad(const ModuleList &module_list) {} + virtual void ModulesDidLoad(const ModuleList &module_list) override {} // Called by ClangExpressionParser::PrepareForExecution to query for any // custom LLVM IR passes that need to be run before an expression is @@ -179,10 +176,7 @@ class LanguageRuntime : public PluginInterface { static char ID; protected: - // Classes that inherit from LanguageRuntime can see and modify these - LanguageRuntime(Process *process); - Process *m_process; private: LanguageRuntime(const LanguageRuntime &) = delete; diff --git a/lldb/include/lldb/Target/Runtime.h b/lldb/include/lldb/Target/Runtime.h new file mode 100644 index 0000000000000..06f0b610e40ba --- /dev/null +++ b/lldb/include/lldb/Target/Runtime.h @@ -0,0 +1,33 @@ +//===-- Runtime.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TARGET_RUNTIME_H +#define LLDB_TARGET_RUNTIME_H + +#include "lldb/Target/Process.h" + +namespace lldb_private { +class Runtime { +public: + Runtime(Process *process) : m_process(process) {} + virtual ~Runtime() = default; + Runtime(const Runtime &) = delete; + const Runtime &operator=(const Runtime &) = delete; + + Process *GetProcess() { return m_process; } + Target &GetTargetRef() { return m_process->GetTarget(); } + + /// Called when modules have been loaded in the process. 
+ virtual void ModulesDidLoad(const ModuleList &module_list) = 0; + +protected: + Process *m_process; +}; +} // namespace lldb_private + +#endif // LLDB_TARGET_RUNTIME_H diff --git a/lldb/include/lldb/Target/StackFrameRecognizer.h b/lldb/include/lldb/Target/StackFrameRecognizer.h index 302b56bec907b..baffc890bb065 100644 --- a/lldb/include/lldb/Target/StackFrameRecognizer.h +++ b/lldb/include/lldb/Target/StackFrameRecognizer.h @@ -154,7 +154,9 @@ class ValueObjectRecognizerSynthesizedValue : public ValueObject { SetName(parent.GetName()); } - uint64_t GetByteSize() override { return m_parent->GetByteSize(); } + llvm::Optional GetByteSize() override { + return m_parent->GetByteSize(); + } lldb::ValueType GetValueType() const override { return m_type; } bool UpdateValue() override { if (!m_parent->UpdateValueIfNeeded()) return false; diff --git a/lldb/include/lldb/Target/SystemRuntime.h b/lldb/include/lldb/Target/SystemRuntime.h index 4f07d7ab52e5b..0ec0793e95f9b 100644 --- a/lldb/include/lldb/Target/SystemRuntime.h +++ b/lldb/include/lldb/Target/SystemRuntime.h @@ -15,6 +15,7 @@ #include "lldb/Core/PluginInterface.h" #include "lldb/Target/QueueItem.h" #include "lldb/Target/QueueList.h" +#include "lldb/Target/Runtime.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/StructuredData.h" #include "lldb/lldb-private.h" @@ -39,7 +40,7 @@ namespace lldb_private { /// can be asked to provide that information. /// -class SystemRuntime : public PluginInterface { +class SystemRuntime : public Runtime, public PluginInterface { public: /// Find a system runtime plugin for a given process. /// @@ -52,7 +53,7 @@ class SystemRuntime : public PluginInterface { static SystemRuntime *FindPlugin(Process *process); /// Construct with a process. - SystemRuntime(lldb_private::Process *process); + SystemRuntime(Process *process); /// Destructor. /// @@ -76,7 +77,7 @@ class SystemRuntime : public PluginInterface { /// /// Allow the SystemRuntime plugin to enable logging features in the system /// runtime libraries. - virtual void ModulesDidLoad(lldb_private::ModuleList &module_list); + virtual void ModulesDidLoad(const ModuleList &module_list) override; /// Called before detaching from a process. /// @@ -294,9 +295,6 @@ class SystemRuntime : public PluginInterface { } protected: - // Member variables. 
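Runtime factors GetProcess/GetTargetRef and the ModulesDidLoad hook out of LanguageRuntime and SystemRuntime into one non-copyable base, as the hunks above and below show. A compilable sketch of the pattern, with stub stand-ins for Process/Target/ModuleList and a hypothetical subclass name:

struct ModuleList {};
struct Target {};
struct Process {
  Target m_target;
  Target &GetTarget() { return m_target; }
};

class Runtime {
public:
  explicit Runtime(Process *process) : m_process(process) {}
  virtual ~Runtime() = default;
  Runtime(const Runtime &) = delete;
  const Runtime &operator=(const Runtime &) = delete;

  Process *GetProcess() { return m_process; }
  Target &GetTargetRef() { return m_process->GetTarget(); }

  // Shared hook: both runtime families react to newly loaded modules.
  virtual void ModulesDidLoad(const ModuleList &module_list) = 0;

protected:
  Process *m_process;
};

// Each runtime family now just overrides the common hook:
class StubLanguageRuntime : public Runtime {
public:
  using Runtime::Runtime;
  void ModulesDidLoad(const ModuleList &) override {}
};

int main() {
  Process process;
  StubLanguageRuntime runtime(&process);
  runtime.ModulesDidLoad(ModuleList());
  return runtime.GetProcess() == &process ? 0 : 1;
}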
- Process *m_process; - std::vector<ConstString> m_types; private: diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 205a0d965c63a..066b8e1845c04 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -253,9 +253,9 @@ class Thread : public std::enable_shared_from_this<Thread>, bool ThreadStoppedForAReason(); - static const char *RunModeAsCString(lldb::RunMode mode); + static std::string RunModeAsString(lldb::RunMode mode); - static const char *StopReasonAsCString(lldb::StopReason reason); + static std::string StopReasonAsString(lldb::StopReason reason); virtual const char *GetInfo() { return nullptr; } diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h index fb24c5a434792..118fdfd85fa9f 100644 --- a/lldb/include/lldb/Utility/RangeMap.h +++ b/lldb/include/lldb/Utility/RangeMap.h @@ -194,41 +194,25 @@ template class RangeVector { #ifdef ASSERT_RANGEMAP_ARE_SORTED assert(IsSorted()); #endif - // Can't combine ranges if we have zero or one range - if (m_entries.size() > 1) { - // The list should be sorted prior to calling this function - typename Collection::iterator pos; - typename Collection::iterator end; - typename Collection::iterator prev; - bool can_combine = false; - // First we determine if we can combine any of the Entry objects so we - // don't end up allocating and making a new collection for no reason - for (pos = m_entries.begin(), end = m_entries.end(), prev = end; - pos != end; prev = pos++) { - if (prev != end && prev->DoesAdjoinOrIntersect(*pos)) { - can_combine = true; - break; - } - } + auto first_intersect = std::adjacent_find( + m_entries.begin(), m_entries.end(), [](const Entry &a, const Entry &b) { + return a.DoesAdjoinOrIntersect(b); + }); + if (first_intersect == m_entries.end()) + return; - // We we can combine at least one entry, then we make a new collection - // and populate it accordingly, and then swap it into place. - if (can_combine) { - Collection minimal_ranges; - for (pos = m_entries.begin(), end = m_entries.end(), prev = end; - pos != end; prev = pos++) { - if (prev != end && prev->DoesAdjoinOrIntersect(*pos)) - minimal_ranges.back().SetRangeEnd( - std::max(prev->GetRangeEnd(), pos->GetRangeEnd())); - else - minimal_ranges.push_back(*pos); - } - // Use the swap technique in case our new vector is much smaller. We - // must swap when using the STL because std::vector objects never - // release or reduce the memory once it has been allocated/reserved. - m_entries.swap(minimal_ranges); - } + // If we can combine at least one entry, then we make a new collection and + // populate it accordingly, and then swap it into place.
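The rewritten CombineConsecutiveRanges above uses std::adjacent_find to locate the first pair of mergeable neighbours before allocating anything. A self-contained sketch of the same algorithm on a simplified entry type (Entry here is illustrative, not the RangeVector template):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Entry {
      unsigned base, size;
      unsigned End() const { return base + size; }
      // Assumes *this starts at or before other (the vector is sorted).
      bool DoesAdjoinOrIntersect(const Entry &other) const {
        return End() >= other.base;
      }
    };

    int main() {
      std::vector<Entry> entries{{0, 4}, {4, 4}, {10, 2}};
      // Find the first element whose successor adjoins or overlaps it --
      // the early-out test the new code performs before any allocation.
      auto first = std::adjacent_find(
          entries.begin(), entries.end(),
          [](const Entry &a, const Entry &b) { return a.DoesAdjoinOrIntersect(b); });
      if (first == entries.end())
        return 0; // already minimal, nothing to combine
      // Copy the untouched prefix, then merge from the first overlap on.
      std::vector<Entry> minimal(entries.begin(), std::next(first));
      for (auto pos = std::next(first); pos != entries.end(); ++pos) {
        Entry &back = minimal.back();
        if (back.DoesAdjoinOrIntersect(*pos))
          back.size = std::max(back.End(), pos->End()) - back.base;
        else
          minimal.push_back(*pos);
      }
      std::printf("%zu ranges after combining\n", minimal.size()); // prints 2
    }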
+ auto pos = std::next(first_intersect); + Collection minimal_ranges(m_entries.begin(), pos); + for (; pos != m_entries.end(); ++pos) { + Entry &back = minimal_ranges.back(); + if (back.DoesAdjoinOrIntersect(*pos)) + back.SetRangeEnd(std::max(back.GetRangeEnd(), pos->GetRangeEnd())); + else + minimal_ranges.push_back(*pos); } + m_entries.swap(minimal_ranges); } BaseType GetMinRangeBase(BaseType fail_value) const { @@ -353,6 +337,10 @@ template class RangeVector { return nullptr; } + using const_iterator = typename Collection::const_iterator; + const_iterator begin() const { return m_entries.begin(); } + const_iterator end() const { return m_entries.end(); } + protected: void CombinePrevAndNext(typename Collection::iterator pos) { // Check if the prev or next entries in case they need to be unioned with diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 524b715230744..45ba7c012229b 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -60,41 +60,35 @@ class Scalar { }; // Constructors and Destructors - Scalar(); - Scalar(int v) : m_type(e_sint), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(int) * 8, v, true); - } - Scalar(unsigned int v) : m_type(e_uint), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(int) * 8, v); - } - Scalar(long v) : m_type(e_slong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long) * 8, v, true); - } - Scalar(unsigned long v) : m_type(e_ulong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long) * 8, v); - } - Scalar(long long v) : m_type(e_slonglong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long long) * 8, v, true); - } + Scalar() : m_type(e_void), m_float(0.0f) {} + Scalar(int v) + : m_type(e_sint), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} + Scalar(unsigned int v) + : m_type(e_uint), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} + Scalar(long v) + : m_type(e_slong), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} + Scalar(unsigned long v) + : m_type(e_ulong), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} + Scalar(long long v) + : m_type(e_slonglong), m_integer(sizeof(v) * 8, uint64_t(v), true), + m_float(0.0f) {} Scalar(unsigned long long v) - : m_type(e_ulonglong), m_float(static_cast(0)) { - m_integer = llvm::APInt(sizeof(long long) * 8, v); - } - Scalar(float v) : m_type(e_float), m_float(v) { m_float = llvm::APFloat(v); } - Scalar(double v) : m_type(e_double), m_float(v) { - m_float = llvm::APFloat(v); - } - Scalar(long double v) - : m_type(e_long_double), - m_float(llvm::APFloat::x87DoubleExtended(), - llvm::APInt(BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast(&v))->x)) {} - Scalar(llvm::APInt v) : m_type(), m_float(static_cast(0)) { - m_integer = llvm::APInt(std::move(v)); - m_type = GetBestTypeForBitSize(m_integer.getBitWidth(), true); + : m_type(e_ulonglong), m_integer(sizeof(v) * 8, uint64_t(v), false), + m_float(0.0f) {} + Scalar(float v) : m_type(e_float), m_float(v) {} + Scalar(double v) : m_type(e_double), m_float(v) {} + Scalar(long double v) : m_type(e_long_double), m_float(double(v)) { + bool ignore; + m_float.convert(llvm::APFloat::x87DoubleExtended(), + llvm::APFloat::rmNearestTiesToEven, &ignore); } - // Scalar(const RegisterValue& reg_value); - virtual ~Scalar(); + Scalar(llvm::APInt v) + : m_type(GetBestTypeForBitSize(v.getBitWidth(), true)), + m_integer(std::move(v)), m_float(0.0f) {} /// Return the most 
efficient Scalar::Type for the requested bit size. static Type GetBestTypeForBitSize(size_t bit_size, bool sign); diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 0bf9e2c105377..6682413820d6b 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -192,7 +192,6 @@ class Status; class StopInfo; class Stoppoint; class StoppointCallbackContext; -class StoppointLocation; class Stream; class StreamFile; class StreamString; @@ -405,7 +404,6 @@ typedef std::shared_ptr typedef std::unique_ptr StackFrameRecognizerManagerUP; typedef std::shared_ptr StopInfoSP; -typedef std::shared_ptr StoppointLocationSP; typedef std::shared_ptr StreamSP; typedef std::weak_ptr StreamWP; typedef std::shared_ptr StreamFileSP; diff --git a/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/Makefile b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/Makefile new file mode 100644 index 0000000000000..7b75ed77849f2 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/Makefile @@ -0,0 +1,2 @@ +C_SOURCES := main.c func.c +include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/TestGuiBasicDebug.py b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/TestGuiBasicDebug.py new file mode 100644 index 0000000000000..54c763fd2afc2 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/TestGuiBasicDebug.py @@ -0,0 +1,42 @@ +""" +Test the 'gui' shortcuts 's','n','f','u','d' (step in, step over, step out, up, down) +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test.lldbpexpect import PExpectTest + +class TestGuiBasicDebugCommandTest(PExpectTest): + + mydir = TestBase.compute_mydir(__file__) + + @skipIfCursesSupportMissing + def test_gui(self): + self.build() + + self.launch(executable=self.getBuildArtifact("a.out"), dimensions=(100,500)) + self.expect('br set -f main.c -p "// Break here"', substrs=["Breakpoint 1", "address ="]) + self.expect("run", substrs=["stop reason ="]) + + escape_key = chr(27).encode() + + # Start the GUI and close the welcome window. + self.child.sendline("gui") + self.child.send(escape_key) + + # Simulate a simple debugging session. 
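Returning to the Scalar changes above: the constructors now build the llvm::APInt directly in the member-initializer list via the (bit-width, value, isSigned) constructor. A small sketch of that APInt pattern:

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // Mirrors m_integer(sizeof(v) * 8, uint64_t(v), true) in the new
      // Scalar(int) constructor: width from the C type, sign-extended value.
      int v = -5;
      llvm::APInt m_integer(sizeof(v) * 8, uint64_t(v), /*isSigned=*/true);
      llvm::outs() << "bits=" << m_integer.getBitWidth()
                   << " value=" << m_integer.getSExtValue() << "\n"; // 32, -5
    }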
+ self.child.send("s") # step + self.child.expect("return 1; // In function[^\r\n]+<<< Thread 1: step in") + self.child.send("u") # up + self.child.expect_exact("func(); // Break here") + self.child.send("d") # down + self.child.expect_exact("return 1; // In function") + self.child.send("f") # finish + self.child.expect("func\(\); // Break here[^\r\n]+<<< Thread 1: step out") + self.child.send("s") # move onto the second one + self.child.expect("func\(\); // Second[^\r\n]+<<< Thread 1: step in") + self.child.send("n") # step over + self.child.expect("return 0;[^\r\n]+<<< Thread 1: step over") + + self.quit() diff --git a/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/func.c b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/func.c new file mode 100644 index 0000000000000..f404a177af4f7 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/func.c @@ -0,0 +1,3 @@ +int func() { + return 1; // In function +} diff --git a/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/main.c b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/main.c new file mode 100644 index 0000000000000..f776fb99898de --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/gui/basicdebug/main.c @@ -0,0 +1,7 @@ +extern int func(); + +int main(int argc, char **argv) { + func(); // Break here + func(); // Second + return 0; +} diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 8c8f2509a8639..873952e4c91ab 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -132,7 +132,7 @@ def wrapper(*args, **kwargs): if reason is not None: self.skipTest(reason) else: - func(*args, **kwargs) + return func(*args, **kwargs) return wrapper # Some decorators can be called both with no arguments (e.g. 
@expectedFailureWindows) diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 4aad2867cf8f4..e4bf6eb569cf3 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1608,7 +1608,7 @@ def yaml2obj(self, yaml_path, obj_path): """ yaml2obj_bin = configuration.get_yaml2obj_path() if not yaml2obj_bin: - self.assertTrue(False, "No valid FileCheck executable specified") + self.assertTrue(False, "No valid yaml2obj executable specified") command = [yaml2obj_bin, "-o=%s" % obj_path, yaml_path] system([command]) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index c1b33c220b4bd..fa5a9c0db1ebd 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -53,11 +53,12 @@ def set_function_breakpoints(self, functions, condition=None, breakpoint_ids.append('%i' % (breakpoint['id'])) return breakpoint_ids - def waitUntil(self, condition): - while True: - if condition(): - break + def waitUntil(self, condition_callback): + for _ in range(20): + if condition_callback(): + return True time.sleep(0.5) + return False def verify_breakpoint_hit(self, breakpoint_ids): '''Wait for the process we are debugging to stop, and verify we hit diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index ce6a7ec830fce..34f03e11c007a 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -126,9 +126,6 @@ if(PYTHON_RPATH) set_property(TARGET liblldb APPEND PROPERTY BUILD_RPATH "${PYTHON_RPATH}") endif() -if (MSVC) - set_source_files_properties(SBReproducer.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() if(lldb_python_wrapper) add_dependencies(liblldb swig_wrapper) diff --git a/lldb/source/API/SBCommandInterpreterRunOptions.cpp b/lldb/source/API/SBCommandInterpreterRunOptions.cpp index fcfbf5e5401ae..da800e8b78046 100644 --- a/lldb/source/API/SBCommandInterpreterRunOptions.cpp +++ b/lldb/source/API/SBCommandInterpreterRunOptions.cpp @@ -24,8 +24,29 @@ SBCommandInterpreterRunOptions::SBCommandInterpreterRunOptions() { m_opaque_up = std::make_unique(); } +SBCommandInterpreterRunOptions::SBCommandInterpreterRunOptions( + const SBCommandInterpreterRunOptions &rhs) + : m_opaque_up() { + LLDB_RECORD_CONSTRUCTOR(SBCommandInterpreterRunOptions, + (const lldb::SBCommandInterpreterRunOptions &), rhs); + + m_opaque_up = std::make_unique(rhs.ref()); +} + SBCommandInterpreterRunOptions::~SBCommandInterpreterRunOptions() = default; +SBCommandInterpreterRunOptions &SBCommandInterpreterRunOptions::operator=( + const SBCommandInterpreterRunOptions &rhs) { + LLDB_RECORD_METHOD(lldb::SBCommandInterpreterRunOptions &, + SBCommandInterpreterRunOptions, operator=, + (const lldb::SBCommandInterpreterRunOptions &), rhs); + + if (this == &rhs) + return LLDB_RECORD_RESULT(*this); + *m_opaque_up = *rhs.m_opaque_up; + return LLDB_RECORD_RESULT(*this); +} + bool SBCommandInterpreterRunOptions::GetStopOnContinue() const { LLDB_RECORD_METHOD_CONST_NO_ARGS(bool, SBCommandInterpreterRunOptions, GetStopOnContinue); @@ -190,12 +211,11 @@ SBCommandInterpreterRunResult::~SBCommandInterpreterRunResult() = default; SBCommandInterpreterRunResult &SBCommandInterpreterRunResult::operator=( const SBCommandInterpreterRunResult &rhs) { 
LLDB_RECORD_METHOD(lldb::SBCommandInterpreterRunResult &, - SBCommandInterpreterRunResult, - operator=,(const lldb::SBCommandInterpreterRunResult &), - rhs); + SBCommandInterpreterRunResult, operator=, + (const lldb::SBCommandInterpreterRunResult &), rhs); if (this == &rhs) - return *this; + return LLDB_RECORD_RESULT(*this); *m_opaque_up = *rhs.m_opaque_up; return LLDB_RECORD_RESULT(*this); } @@ -220,6 +240,11 @@ namespace repro { template <> void RegisterMethods(Registry &R) { LLDB_REGISTER_CONSTRUCTOR(SBCommandInterpreterRunOptions, ()); + LLDB_REGISTER_CONSTRUCTOR(SBCommandInterpreterRunOptions, + (const lldb::SBCommandInterpreterRunOptions &)); + LLDB_REGISTER_METHOD(lldb::SBCommandInterpreterRunOptions &, + SBCommandInterpreterRunOptions, operator=, + (const lldb::SBCommandInterpreterRunOptions &)); LLDB_REGISTER_METHOD_CONST(bool, SBCommandInterpreterRunOptions, GetStopOnContinue, ()); LLDB_REGISTER_METHOD(void, SBCommandInterpreterRunOptions, SetStopOnContinue, @@ -260,8 +285,8 @@ template <> void RegisterMethods(Registry &R) { LLDB_REGISTER_CONSTRUCTOR(SBCommandInterpreterRunResult, (const lldb::SBCommandInterpreterRunResult &)); LLDB_REGISTER_METHOD(lldb::SBCommandInterpreterRunResult &, - SBCommandInterpreterRunResult, - operator=,(const lldb::SBCommandInterpreterRunResult &)); + SBCommandInterpreterRunResult, operator=, + (const lldb::SBCommandInterpreterRunResult &)); LLDB_REGISTER_METHOD_CONST(int, SBCommandInterpreterRunResult, GetNumberOfErrors, ()); LLDB_REGISTER_METHOD_CONST(lldb::CommandInterpreterResult, diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 7485b0ee1838e..686d1f23a75a8 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -333,7 +333,7 @@ size_t SBValue::GetByteSize() { ValueLocker locker; lldb::ValueObjectSP value_sp(GetSP(locker)); if (value_sp) { - result = value_sp->GetByteSize(); + result = value_sp->GetByteSize().getValueOr(0); } return result; diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index 88c7f3352f426..9ed9549a789ea 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -51,7 +51,7 @@ Breakpoint::Breakpoint(Target &target, SearchFilterSP &filter_sp, : m_being_created(true), m_hardware(hardware), m_target(target), m_filter_sp(filter_sp), m_resolver_sp(resolver_sp), m_options_up(new BreakpointOptions(true)), m_locations(*this), - m_resolve_indirect_symbols(resolve_indirect_symbols), m_hit_count(0) { + m_resolve_indirect_symbols(resolve_indirect_symbols), m_hit_counter() { m_being_created = false; } @@ -61,7 +61,7 @@ Breakpoint::Breakpoint(Target &new_target, const Breakpoint &source_bp) m_options_up(new BreakpointOptions(*source_bp.m_options_up)), m_locations(*this), m_resolve_indirect_symbols(source_bp.m_resolve_indirect_symbols), - m_hit_count(0) {} + m_hit_counter() {} // Destructor Breakpoint::~Breakpoint() = default; @@ -341,7 +341,7 @@ bool Breakpoint::IgnoreCountShouldStop() { return true; } -uint32_t Breakpoint::GetHitCount() const { return m_hit_count; } +uint32_t Breakpoint::GetHitCount() const { return m_hit_counter.GetValue(); } bool Breakpoint::IsOneShot() const { return m_options_up->IsOneShot(); } diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 93d54c051ee5f..d3d6ea08bdb38 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -31,11 +31,10 @@ using namespace lldb_private; 
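The copy constructor and operator= added above follow the usual deep-copy pimpl pattern over a unique_ptr, guarded against self-assignment. The same shape with the reproducer-recording macros omitted (Impl and Options are simplified names, not the SB types):

    #include <memory>

    struct Impl { int value = 0; };

    class Options {
    public:
      Options() : m_opaque_up(std::make_unique<Impl>()) {}
      // Deep-copy the pointee rather than sharing it.
      Options(const Options &rhs)
          : m_opaque_up(std::make_unique<Impl>(*rhs.m_opaque_up)) {}
      Options &operator=(const Options &rhs) {
        if (this == &rhs) // self-assignment guard, as in the patch
          return *this;
        *m_opaque_up = *rhs.m_opaque_up; // reuse our own allocation
        return *this;
      }

    private:
      std::unique_ptr<Impl> m_opaque_up;
    };

    int main() {
      Options a, b;
      b = a; // deep copy
      b = b; // safe: the guard short-circuits
    }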
BreakpointLocation::BreakpointLocation(break_id_t loc_id, Breakpoint &owner, const Address &addr, lldb::tid_t tid, bool hardware, bool check_for_resolver) - : StoppointLocation(loc_id, addr.GetOpcodeLoadAddress(&owner.GetTarget()), - hardware), - m_being_created(true), m_should_resolve_indirect_functions(false), + : m_being_created(true), m_should_resolve_indirect_functions(false), m_is_reexported(false), m_is_indirect(false), m_address(addr), - m_owner(owner), m_options_up(), m_bp_site_sp(), m_condition_mutex() { + m_owner(owner), m_options_up(), m_bp_site_sp(), m_condition_mutex(), + m_condition_hash(0), m_loc_id(loc_id), m_hit_counter() { if (check_for_resolver) { Symbol *symbol = m_address.CalculateSymbolContextSymbol(); if (symbol && symbol->IsIndirect()) { @@ -332,7 +331,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx, return ret; } -uint32_t BreakpointLocation::GetIgnoreCount() { +uint32_t BreakpointLocation::GetIgnoreCount() const { return GetOptionsSpecifyingKind(BreakpointOptions::eIgnoreCount) ->GetIgnoreCount(); } @@ -417,16 +416,16 @@ bool BreakpointLocation::ShouldStop(StoppointCallbackContext *context) { void BreakpointLocation::BumpHitCount() { if (IsEnabled()) { // Step our hit count, and also step the hit count of the owner. - IncrementHitCount(); - m_owner.IncrementHitCount(); + m_hit_counter.Increment(); + m_owner.m_hit_counter.Increment(); } } void BreakpointLocation::UndoBumpHitCount() { if (IsEnabled()) { // Step our hit count, and also step the hit count of the owner. - DecrementHitCount(); - m_owner.DecrementHitCount(); + m_hit_counter.Decrement(); + m_owner.m_hit_counter.Decrement(); } } @@ -593,12 +592,15 @@ void BreakpointLocation::GetDescription(Stream *s, } } + bool is_resolved = IsResolved(); + bool is_hardware = is_resolved && m_bp_site_sp->IsHardware(); + if (level == lldb::eDescriptionLevelVerbose) { s->EOL(); s->Indent(); - s->Printf("resolved = %s\n", IsResolved() ? "true" : "false"); + s->Printf("resolved = %s\n", is_resolved ? "true" : "false"); s->Indent(); - s->Printf("hardware = %s\n", IsHardware() ? "true" : "false"); + s->Printf("hardware = %s\n", is_hardware ? "true" : "false"); s->Indent(); s->Printf("hit count = %-4u\n", GetHitCount()); @@ -609,8 +611,8 @@ void BreakpointLocation::GetDescription(Stream *s, } s->IndentLess(); } else if (level != eDescriptionLevelInitial) { - s->Printf(", %sresolved, %shit count = %u ", (IsResolved() ? "" : "un"), - (IsHardware() ? "hardware, " : ""), GetHitCount()); + s->Printf(", %sresolved, %shit count = %u ", (is_resolved ? "" : "un"), + (is_hardware ? "hardware, " : ""), GetHitCount()); if (m_options_up) { m_options_up->GetDescription(s, level); } @@ -621,6 +623,11 @@ void BreakpointLocation::Dump(Stream *s) const { if (s == nullptr) return; + bool is_resolved = IsResolved(); + bool is_hardware = is_resolved && m_bp_site_sp->IsHardware(); + auto hardware_index = is_resolved ? + m_bp_site_sp->GetHardwareIndex() : LLDB_INVALID_INDEX32; + lldb::tid_t tid = GetOptionsSpecifyingKind(BreakpointOptions::eThreadSpec) ->GetThreadSpecNoCreate()->GetTID(); s->Printf("BreakpointLocation %u: tid = %4.4" PRIx64 @@ -631,8 +638,7 @@ void BreakpointLocation::Dump(Stream *s) const { (m_options_up ? m_options_up->IsEnabled() : m_owner.IsEnabled()) ? "enabled " : "disabled", - IsHardware() ? "hardware" : "software", GetHardwareIndex(), - GetHitCount(), + is_hardware ? 
"hardware" : "software", hardware_index, GetHitCount(), GetOptionsSpecifyingKind(BreakpointOptions::eIgnoreCount) ->GetIgnoreCount()); } diff --git a/lldb/source/Breakpoint/BreakpointSite.cpp b/lldb/source/Breakpoint/BreakpointSite.cpp index a33fd0a1c462c..fb3f0cd068971 100644 --- a/lldb/source/Breakpoint/BreakpointSite.cpp +++ b/lldb/source/Breakpoint/BreakpointSite.cpp @@ -21,7 +21,7 @@ using namespace lldb_private; BreakpointSite::BreakpointSite(BreakpointSiteList *list, const BreakpointLocationSP &owner, lldb::addr_t addr, bool use_hardware) - : StoppointLocation(GetNextID(), addr, 0, use_hardware), + : StoppointSite(GetNextID(), addr, 0, use_hardware), m_type(eSoftware), // Process subclasses need to set this correctly using // SetType() m_saved_opcode(), m_trap_opcode(), @@ -48,7 +48,7 @@ break_id_t BreakpointSite::GetNextID() { // should continue. bool BreakpointSite::ShouldStop(StoppointCallbackContext *context) { - IncrementHitCount(); + m_hit_counter.Increment(); // ShouldStop can do a lot of work, and might even come come back and hit // this breakpoint site again. So don't hold the m_owners_mutex the whole // while. Instead make a local copy of the collection and call ShouldStop on @@ -156,51 +156,46 @@ void BreakpointSite::BumpHitCounts() { } } -void BreakpointSite::SetHardwareIndex(uint32_t index) { - std::lock_guard guard(m_owners_mutex); - for (BreakpointLocationSP loc_sp : m_owners.BreakpointLocations()) { - loc_sp->SetHardwareIndex(index); - } -} - bool BreakpointSite::IntersectsRange(lldb::addr_t addr, size_t size, lldb::addr_t *intersect_addr, size_t *intersect_size, size_t *opcode_offset) const { - // We only use software traps for software breakpoints - if (!IsHardware()) { - if (m_byte_size > 0) { - const lldb::addr_t bp_end_addr = m_addr + m_byte_size; - const lldb::addr_t end_addr = addr + size; - // Is the breakpoint end address before the passed in start address? - if (bp_end_addr <= addr) - return false; - // Is the breakpoint start address after passed in end address? - if (end_addr <= m_addr) - return false; - if (intersect_addr || intersect_size || opcode_offset) { - if (m_addr < addr) { - if (intersect_addr) - *intersect_addr = addr; - if (intersect_size) - *intersect_size = - std::min(bp_end_addr, end_addr) - addr; - if (opcode_offset) - *opcode_offset = addr - m_addr; - } else { - if (intersect_addr) - *intersect_addr = m_addr; - if (intersect_size) - *intersect_size = - std::min(bp_end_addr, end_addr) - m_addr; - if (opcode_offset) - *opcode_offset = 0; - } - } - return true; + // The function should be called only for software breakpoints. + lldbassert(GetType() == Type::eSoftware); + + if (m_byte_size == 0) + return false; + + const lldb::addr_t bp_end_addr = m_addr + m_byte_size; + const lldb::addr_t end_addr = addr + size; + // Is the breakpoint end address before the passed in start address? + if (bp_end_addr <= addr) + return false; + + // Is the breakpoint start address after passed in end address? 
+ if (end_addr <= m_addr) + return false; + + if (intersect_addr || intersect_size || opcode_offset) { + if (m_addr < addr) { + if (intersect_addr) + *intersect_addr = addr; + if (intersect_size) + *intersect_size = + std::min(bp_end_addr, end_addr) - addr; + if (opcode_offset) + *opcode_offset = addr - m_addr; + } else { + if (intersect_addr) + *intersect_addr = m_addr; + if (intersect_size) + *intersect_size = + std::min(bp_end_addr, end_addr) - m_addr; + if (opcode_offset) + *opcode_offset = 0; } } - return false; + return true; } size_t diff --git a/lldb/source/Breakpoint/CMakeLists.txt b/lldb/source/Breakpoint/CMakeLists.txt index a7c0baf21af61..4862c2b364033 100644 --- a/lldb/source/Breakpoint/CMakeLists.txt +++ b/lldb/source/Breakpoint/CMakeLists.txt @@ -19,7 +19,7 @@ add_lldb_library(lldbBreakpoint BreakpointSiteList.cpp Stoppoint.cpp StoppointCallbackContext.cpp - StoppointLocation.cpp + StoppointSite.cpp Watchpoint.cpp WatchpointList.cpp WatchpointOptions.cpp diff --git a/lldb/source/Breakpoint/StoppointLocation.cpp b/lldb/source/Breakpoint/StoppointLocation.cpp deleted file mode 100644 index 5bb4c78548404..0000000000000 --- a/lldb/source/Breakpoint/StoppointLocation.cpp +++ /dev/null @@ -1,32 +0,0 @@ -//===-- StoppointLocation.cpp ---------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "lldb/Breakpoint/StoppointLocation.h" - - -using namespace lldb; -using namespace lldb_private; - -// StoppointLocation constructor -StoppointLocation::StoppointLocation(break_id_t bid, addr_t addr, bool hardware) - : m_loc_id(bid), m_addr(addr), m_hardware(hardware), - m_hardware_index(LLDB_INVALID_INDEX32), m_byte_size(0), m_hit_count(0) {} - -StoppointLocation::StoppointLocation(break_id_t bid, addr_t addr, - uint32_t byte_size, bool hardware) - : m_loc_id(bid), m_addr(addr), m_hardware(hardware), - m_hardware_index(LLDB_INVALID_INDEX32), m_byte_size(byte_size), - m_hit_count(0) {} - -// Destructor -StoppointLocation::~StoppointLocation() {} - -void StoppointLocation::DecrementHitCount() { - assert(m_hit_count > 0); - --m_hit_count; -} diff --git a/lldb/source/Breakpoint/StoppointSite.cpp b/lldb/source/Breakpoint/StoppointSite.cpp new file mode 100644 index 0000000000000..ba8c48326bdb6 --- /dev/null +++ b/lldb/source/Breakpoint/StoppointSite.cpp @@ -0,0 +1,23 @@ +//===-- StoppointSite.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Breakpoint/StoppointSite.h" + + +using namespace lldb; +using namespace lldb_private; + +StoppointSite::StoppointSite(break_id_t id, addr_t addr, bool hardware) + : m_id(id), m_addr(addr), m_is_hardware_required(hardware), + m_hardware_index(LLDB_INVALID_INDEX32), m_byte_size(0), m_hit_counter() {} + +StoppointSite::StoppointSite(break_id_t id, addr_t addr, + uint32_t byte_size, bool hardware) + : m_id(id), m_addr(addr), m_is_hardware_required(hardware), + m_hardware_index(LLDB_INVALID_INDEX32), m_byte_size(byte_size), + m_hit_counter() {} diff --git a/lldb/source/Breakpoint/Watchpoint.cpp b/lldb/source/Breakpoint/Watchpoint.cpp index df73c6a172303..29ae1ef3df26e 100644 --- a/lldb/source/Breakpoint/Watchpoint.cpp +++ b/lldb/source/Breakpoint/Watchpoint.cpp @@ -25,7 +25,7 @@ using namespace lldb_private; Watchpoint::Watchpoint(Target &target, lldb::addr_t addr, uint32_t size, const CompilerType *type, bool hardware) - : StoppointLocation(0, addr, size, hardware), m_target(target), + : StoppointSite(0, addr, size, hardware), m_target(target), m_enabled(false), m_is_hardware(hardware), m_is_watch_variable(false), m_is_ephemeral(false), m_disabled_count(0), m_watch_read(0), m_watch_write(0), m_watch_was_read(0), m_watch_was_written(0), @@ -93,9 +93,10 @@ void Watchpoint::SetWatchSpec(const std::string &str) { m_watch_spec_str = str; } -// Override default impl of StoppointLocation::IsHardware() since m_is_hardware -// member field is more accurate. -bool Watchpoint::IsHardware() const { return m_is_hardware; } +bool Watchpoint::IsHardware() const { + lldbassert(m_is_hardware || !HardwareRequired()); + return m_is_hardware; +} bool Watchpoint::IsWatchVariable() const { return m_is_watch_variable; } @@ -123,12 +124,12 @@ bool Watchpoint::CaptureWatchedValue(const ExecutionContext &exe_ctx) { void Watchpoint::IncrementFalseAlarmsAndReviseHitCount() { ++m_false_alarms; if (m_false_alarms) { - if (m_hit_count >= m_false_alarms) { - m_hit_count -= m_false_alarms; + if (m_hit_counter.GetValue() >= m_false_alarms) { + m_hit_counter.Decrement(m_false_alarms); m_false_alarms = 0; } else { - m_false_alarms -= m_hit_count; - m_hit_count = 0; + m_false_alarms -= m_hit_counter.GetValue(); + m_hit_counter.Reset(); } } } @@ -137,7 +138,7 @@ void Watchpoint::IncrementFalseAlarmsAndReviseHitCount() { // should continue. 
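Several classes above now route their bookkeeping through an m_hit_counter object instead of a raw m_hit_count integer. The counter's implementation is not part of this patch, so the following is only an assumed minimal model, shown together with the false-alarm revision arithmetic from Watchpoint:

    #include <cstdint>
    #include <cstdio>

    // Assumed minimal model of the hit-counter interface used above.
    struct HitCounter {
      uint32_t GetValue() const { return m_value; }
      void Increment(uint32_t n = 1) { m_value += n; }
      void Decrement(uint32_t n = 1) { m_value -= n; }
      void Reset() { m_value = 0; }
    private:
      uint32_t m_value = 0;
    };

    int main() {
      HitCounter hits;
      uint32_t false_alarms = 0;
      hits.Increment(3); // three watchpoint triggers...
      false_alarms += 2; // ...two of which were spurious
      // Same arithmetic as IncrementFalseAlarmsAndReviseHitCount():
      if (hits.GetValue() >= false_alarms) {
        hits.Decrement(false_alarms);
        false_alarms = 0;
      } else {
        false_alarms -= hits.GetValue();
        hits.Reset();
      }
      std::printf("revised hit count: %u\n", hits.GetValue()); // prints 1
    }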
bool Watchpoint::ShouldStop(StoppointCallbackContext *context) { - IncrementHitCount(); + m_hit_counter.Increment(); return IsEnabled(); } diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index 255fbe53fb2ea..eaf22344fafa2 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -970,7 +970,7 @@ a number follows 'f':" std::string subst(std::string(regex_sed.substr( second_separator_char_pos + 1, third_separator_char_pos - second_separator_char_pos - 1))); - m_regex_cmd_up->AddRegexCommand(regex.c_str(), subst.c_str()); + m_regex_cmd_up->AddRegexCommand(regex, subst); } return error; } diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index f86779d85b5fa..fd8d38e856372 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -48,19 +48,19 @@ class CommandObjectProcessLaunchOrAttach : public CommandObjectParsed { state = process->GetState(); if (process->IsAlive() && state != eStateConnected) { - char message[1024]; + std::string message; if (process->GetState() == eStateAttaching) - ::snprintf(message, sizeof(message), - "There is a pending attach, abort it and %s?", - m_new_process_action.c_str()); + message = + llvm::formatv("There is a pending attach, abort it and {0}?", + m_new_process_action); else if (process->GetShouldDetach()) - ::snprintf(message, sizeof(message), - "There is a running process, detach from it and %s?", - m_new_process_action.c_str()); + message = llvm::formatv( + "There is a running process, detach from it and {0}?", + m_new_process_action); else - ::snprintf(message, sizeof(message), - "There is a running process, kill it and %s?", - m_new_process_action.c_str()); + message = + llvm::formatv("There is a running process, kill it and {0}?", + m_new_process_action); if (!m_interpreter.Confirm(message, true)) { result.SetStatus(eReturnStatusFailed); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 0a14a25107a46..84b7c988353a0 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -23,6 +23,7 @@ #include "lldb/Interpreter/OptionGroupBoolean.h" #include "lldb/Interpreter/OptionGroupFile.h" #include "lldb/Interpreter/OptionGroupFormat.h" +#include "lldb/Interpreter/OptionGroupPlatform.h" #include "lldb/Interpreter/OptionGroupString.h" #include "lldb/Interpreter/OptionGroupUInt64.h" #include "lldb/Interpreter/OptionGroupUUID.h" @@ -206,8 +207,6 @@ class OptionGroupDependents : public OptionGroup { #pragma mark CommandObjectTargetCreate -// "target create" - class CommandObjectTargetCreate : public CommandObjectParsed { public: CommandObjectTargetCreate(CommandInterpreter &interpreter) @@ -216,11 +215,9 @@ class CommandObjectTargetCreate : public CommandObjectParsed { "Create a target using the argument as the main executable.", nullptr), m_option_group(), m_arch_option(), + m_platform_options(true), // Include the --platform option. 
m_core_file(LLDB_OPT_SET_1, false, "core", 'c', 0, eArgTypeFilename, "Fullpath to a core file to use for this target."), - m_platform_path(LLDB_OPT_SET_1, false, "platform-path", 'P', 0, - eArgTypePath, - "Path to the remote file to use for this target."), m_symbol_file(LLDB_OPT_SET_1, false, "symfile", 's', 0, eArgTypeFilename, "Fullpath to a stand alone debug " @@ -245,8 +242,8 @@ class CommandObjectTargetCreate : public CommandObjectParsed { m_arguments.push_back(arg); m_option_group.Append(&m_arch_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); + m_option_group.Append(&m_platform_options, LLDB_OPT_SET_ALL, 1); m_option_group.Append(&m_core_file, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); - m_option_group.Append(&m_platform_path, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_symbol_file, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_remote_file, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_add_dependents, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); @@ -317,7 +314,8 @@ class CommandObjectTargetCreate : public CommandObjectParsed { llvm::StringRef arch_cstr = m_arch_option.GetArchitectureName(); Status error(debugger.GetTargetList().CreateTarget( debugger, file_path, arch_cstr, - m_add_dependents.m_load_dependent_files, nullptr, target_sp)); + m_add_dependents.m_load_dependent_files, &m_platform_options, + target_sp)); if (target_sp) { // Only get the platform after we create the target because we might @@ -448,8 +446,8 @@ class CommandObjectTargetCreate : public CommandObjectParsed { private: OptionGroupOptions m_option_group; OptionGroupArchitecture m_arch_option; + OptionGroupPlatform m_platform_options; OptionGroupFile m_core_file; - OptionGroupFile m_platform_path; OptionGroupFile m_symbol_file; OptionGroupFile m_remote_file; OptionGroupDependents m_add_dependents; @@ -457,8 +455,6 @@ class CommandObjectTargetCreate : public CommandObjectParsed { #pragma mark CommandObjectTargetList -// "target list" - class CommandObjectTargetList : public CommandObjectParsed { public: CommandObjectTargetList(CommandInterpreter &interpreter) @@ -490,8 +486,6 @@ class CommandObjectTargetList : public CommandObjectParsed { #pragma mark CommandObjectTargetSelect -// "target select" - class CommandObjectTargetSelect : public CommandObjectParsed { public: CommandObjectTargetSelect(CommandInterpreter &interpreter) @@ -551,8 +545,6 @@ class CommandObjectTargetSelect : public CommandObjectParsed { #pragma mark CommandObjectTargetDelete -// "target delete" - class CommandObjectTargetDelete : public CommandObjectParsed { public: CommandObjectTargetDelete(CommandInterpreter &interpreter) @@ -697,8 +689,6 @@ class CommandObjectTargetShowLaunchEnvironment : public CommandObjectParsed { #pragma mark CommandObjectTargetVariable -// "target variable" - class CommandObjectTargetVariable : public CommandObjectParsed { static const uint32_t SHORT_OPTION_FILE = 0x66696c65; // 'file' static const uint32_t SHORT_OPTION_SHLB = 0x73686c62; // 'shlb' diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index ce4662930a7c2..c2a008af79d6f 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -905,7 +905,7 @@ corresponding to the byte size of the data type."); // We're in business. // Find out the size of this variable. size = m_option_watchpoint.watch_size == 0 - ? valobj_sp->GetByteSize() + ? 
valobj_sp->GetByteSize().getValueOr(0) : m_option_watchpoint.watch_size; } compiler_type = valobj_sp->GetCompilerType(); diff --git a/lldb/source/Core/Communication.cpp b/lldb/source/Core/Communication.cpp index b358e70b1a91e..b50cd0ecab5ce 100644 --- a/lldb/source/Core/Communication.cpp +++ b/lldb/source/Core/Communication.cpp @@ -199,9 +199,8 @@ bool Communication::StartReadThread(Status *error_ptr) { LLDB_LOG(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_COMMUNICATION), "{0} Communication::StartReadThread ()", this); - char thread_name[1024]; - snprintf(thread_name, sizeof(thread_name), "", - GetBroadcasterName().AsCString()); + const std::string thread_name = + llvm::formatv("", GetBroadcasterName()); m_read_thread_enabled = true; m_read_thread_did_exit = false; @@ -340,7 +339,7 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) { } if (error.Fail()) LLDB_LOG(log, "error: {0}, status = {1}", error, - Communication::ConnectionStatusAsCString(status)); + Communication::ConnectionStatusAsString(status)); break; case eConnectionStatusInterrupted: // Synchronization signal from // SynchronizeWithReadThread() @@ -356,7 +355,7 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) { case eConnectionStatusTimedOut: // Request timed out if (error.Fail()) LLDB_LOG(log, "error: {0}, status = {1}", error, - Communication::ConnectionStatusAsCString(status)); + Communication::ConnectionStatusAsString(status)); break; } } @@ -417,8 +416,8 @@ void Communication::SetConnection(std::unique_ptr connection) { m_connection_sp = std::move(connection); } -const char * -Communication::ConnectionStatusAsCString(lldb::ConnectionStatus status) { +std::string +Communication::ConnectionStatusAsString(lldb::ConnectionStatus status) { switch (status) { case eConnectionStatusSuccess: return "success"; @@ -436,8 +435,5 @@ Communication::ConnectionStatusAsCString(lldb::ConnectionStatus status) { return "interrupted"; } - static char unknown_state_string[64]; - snprintf(unknown_state_string, sizeof(unknown_state_string), - "ConnectionStatus = %i", status); - return unknown_state_string; + return "@" + std::to_string(status); } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 5f4f1e266d81c..05cfac19915e1 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -666,9 +666,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) m_event_handler_thread(), m_io_handler_thread(), m_sync_broadcaster(nullptr, "lldb.debugger.sync"), m_forward_listener_sp(), m_clear_once() { - char instance_cstr[256]; - snprintf(instance_cstr, sizeof(instance_cstr), "debugger_%d", (int)GetID()); - m_instance_name.SetCString(instance_cstr); + m_instance_name.SetString(llvm::formatv("debugger_{0}", GetID()).str()); if (log_callback) m_log_callback_stream_sp = std::make_shared(log_callback, baton); diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index de4bd2a79095a..144b2112183c2 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -916,9 +916,9 @@ void Menu::DrawMenuTitle(Window &window, bool highlight) { } else { const int shortcut_key = m_key_value; bool underlined_shortcut = false; - const attr_t hilgight_attr = A_REVERSE; + const attr_t highlight_attr = A_REVERSE; if (highlight) - window.AttributeOn(hilgight_attr); + window.AttributeOn(highlight_attr); if (llvm::isPrint(shortcut_key)) { size_t lower_pos = 
m_name.find(tolower(shortcut_key)); size_t upper_pos = m_name.find(toupper(shortcut_key)); @@ -945,7 +945,7 @@ void Menu::DrawMenuTitle(Window &window, bool highlight) { } if (highlight) - window.AttributeOff(hilgight_attr); + window.AttributeOff(highlight_attr); if (m_key_name.empty()) { if (!underlined_shortcut && llvm::isPrint(m_key_value)) { @@ -1287,6 +1287,10 @@ class Application { update = true; break; case eKeyNotHandled: + if (ch == 12) { // Ctrl+L, force full redraw + redrawwin(m_window_sp->get()); + update = true; + } break; case eQuitApplication: done = true; @@ -2829,7 +2833,8 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { eMenuID_Process, eMenuID_ProcessAttach, - eMenuID_ProcessDetach, + eMenuID_ProcessDetachResume, + eMenuID_ProcessDetachSuspended, eMenuID_ProcessLaunch, eMenuID_ProcessContinue, eMenuID_ProcessHalt, @@ -2974,13 +2979,15 @@ class ApplicationDelegate : public WindowDelegate, public MenuDelegate { } return MenuActionResult::Handled; - case eMenuID_ProcessDetach: { + case eMenuID_ProcessDetachResume: + case eMenuID_ProcessDetachSuspended: { ExecutionContext exe_ctx = m_debugger.GetCommandInterpreter().GetExecutionContext(); if (exe_ctx.HasProcessScope()) { Process *process = exe_ctx.GetProcessPtr(); if (process && process->IsAlive()) - process->Detach(false); + process->Detach(menu.GetIdentifier() == + eMenuID_ProcessDetachSuspended); } } return MenuActionResult::Handled; @@ -3233,15 +3240,15 @@ class SourceFileWindowDelegate : public WindowDelegate { {KEY_NPAGE, "Page down"}, {'b', "Set breakpoint on selected source/disassembly line"}, {'c', "Continue process"}, - {'d', "Detach and resume process"}, {'D', "Detach with process suspended"}, {'h', "Show help dialog"}, - {'k', "Kill process"}, {'n', "Step over (source line)"}, {'N', "Step over (single instruction)"}, - {'o', "Step out"}, + {'f', "Step out (finish)"}, {'s', "Step in (source line)"}, {'S', "Step in (single instruction)"}, + {'u', "Frame up"}, + {'d', "Frame down"}, {',', "Page up"}, {'.', "Page down"}, {'\0', nullptr}}; @@ -3798,26 +3805,15 @@ class SourceFileWindowDelegate : public WindowDelegate { } return eKeyHandled; - case 'd': // 'd' == detach and let run case 'D': // 'D' == detach and keep stopped { ExecutionContext exe_ctx = m_debugger.GetCommandInterpreter().GetExecutionContext(); if (exe_ctx.HasProcessScope()) - exe_ctx.GetProcessRef().Detach(c == 'D'); + exe_ctx.GetProcessRef().Detach(true); } return eKeyHandled; - case 'k': - // 'k' == kill - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) - exe_ctx.GetProcessRef().Destroy(false); - } - return eKeyHandled; - case 'c': // 'c' == continue { @@ -3828,8 +3824,8 @@ class SourceFileWindowDelegate : public WindowDelegate { } return eKeyHandled; - case 'o': - // 'o' == step out + case 'f': + // 'f' == step out (finish) { ExecutionContext exe_ctx = m_debugger.GetCommandInterpreter().GetExecutionContext(); @@ -3866,6 +3862,26 @@ class SourceFileWindowDelegate : public WindowDelegate { } return eKeyHandled; + case 'u': // 'u' == frame up + case 'd': // 'd' == frame down + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope()) { + Thread *thread = exe_ctx.GetThreadPtr(); + uint32_t frame_idx = thread->GetSelectedFrameIndex(); + if (frame_idx == UINT32_MAX) + frame_idx = 0; + if (c == 'u' && frame_idx + 1 < thread->GetStackFrameCount()) + ++frame_idx; + else if (c == 'd' && 
frame_idx > 0) + --frame_idx; + if (thread->SetSelectedFrameByIndex(frame_idx, true)) + exe_ctx.SetFrameSP(thread->GetSelectedFrame()); + } + } + return eKeyHandled; + case 'h': window.CreateHelpSubwindow(); return eKeyHandled; @@ -3937,8 +3953,12 @@ void IOHandlerCursesGUI::Activate() { ApplicationDelegate::eMenuID_Process)); process_menu_sp->AddSubmenu(MenuSP(new Menu( "Attach", nullptr, 'a', ApplicationDelegate::eMenuID_ProcessAttach))); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Detach", nullptr, 'd', ApplicationDelegate::eMenuID_ProcessDetach))); + process_menu_sp->AddSubmenu( + MenuSP(new Menu("Detach and resume", nullptr, 'd', + ApplicationDelegate::eMenuID_ProcessDetachResume))); + process_menu_sp->AddSubmenu( + MenuSP(new Menu("Detach suspended", nullptr, 's', + ApplicationDelegate::eMenuID_ProcessDetachSuspended))); process_menu_sp->AddSubmenu(MenuSP(new Menu( "Launch", nullptr, 'l', ApplicationDelegate::eMenuID_ProcessLaunch))); process_menu_sp->AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 7414dd281d43a..e79fcb48742d8 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -183,14 +183,14 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( break; } - char prefix[32] = ""; + std::string prefix; if (bp_locs) { uint32_t bp_count = bp_locs->NumLineEntriesWithLine(line); if (bp_count > 0) - ::snprintf(prefix, sizeof(prefix), "[%u] ", bp_count); + prefix = llvm::formatv("[{0}]", bp_count); else - ::snprintf(prefix, sizeof(prefix), " "); + prefix = " "; } char buffer[3]; @@ -206,7 +206,8 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( .str()); } - s->Printf("%s%s %-4u\t", prefix, current_line_highlight.c_str(), line); + s->Printf("%s%s %-4u\t", prefix.c_str(), current_line_highlight.c_str(), + line); // So far we treated column 0 as a special 'no column value', but // DisplaySourceLines starts counting columns from 0 (and no column is diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index 3a775b07e5e1f..78711c4e42ce2 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -849,7 +849,7 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1524,7 +1524,7 @@ bool ValueObject::SetValueFromCString(const char *value_str, Status &error) { uint64_t count = 0; const Encoding encoding = GetCompilerType().GetEncoding(count); - const size_t byte_size = GetByteSize(); + const size_t byte_size = GetByteSize().getValueOr(0); Value::ValueType value_type = m_value.GetValueType(); @@ -1702,8 +1702,7 @@ ValueObjectSP ValueObject::GetSyntheticArrayMember(size_t index, bool can_create) { ValueObjectSP synthetic_child_sp; if (IsPointerType() || IsArrayType()) { - char index_str[64]; - snprintf(index_str, sizeof(index_str), "[%" PRIu64 "]", (uint64_t)index); + std::string index_str = llvm::formatv("[{0}]", index); ConstString index_const_str(index_str); // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. 
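The GetByteSize().getValueOr(0) calls above are part of a patch-wide migration of GetByteSize from uint64_t to llvm::Optional<uint64_t>, so an unknown size is no longer conflated with a size of zero. A small caller-side sketch (the standalone GetByteSize function is hypothetical):

    #include "llvm/ADT/Optional.h"
    #include <cstdint>
    #include <cstdio>

    // llvm::None now means "size unknown", which a plain uint64_t return
    // value could not distinguish from a genuine zero-sized value.
    llvm::Optional<uint64_t> GetByteSize(bool type_is_valid) {
      if (!type_is_valid)
        return llvm::None;
      return 8;
    }

    int main() {
      // Call sites that still need a number make the fallback explicit,
      // matching the getValueOr(0) pattern used throughout the patch.
      uint64_t byte_size = GetByteSize(false).getValueOr(0);
      std::printf("byte size = %llu\n", (unsigned long long)byte_size);
    }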
@@ -1730,8 +1729,7 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, bool can_create) { ValueObjectSP synthetic_child_sp; if (IsScalarType()) { - char index_str[64]; - snprintf(index_str, sizeof(index_str), "[%i-%i]", from, to); + std::string index_str = llvm::formatv("[{0}-{1}]", from, to); ConstString index_const_str(index_str); // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. @@ -1741,13 +1739,13 @@ ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, uint32_t bit_field_offset = from; if (GetDataExtractor().GetByteOrder() == eByteOrderBig) bit_field_offset = - GetByteSize() * 8 - bit_field_size - bit_field_offset; + GetByteSize().getValueOr(0) * 8 - bit_field_size - bit_field_offset; // We haven't made a synthetic array member for INDEX yet, so lets make // one and cache it for any future reference. ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, GetByteSize(), 0, - bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, - 0); + *this, GetCompilerType(), index_const_str, + GetByteSize().getValueOr(0), 0, bit_field_size, bit_field_offset, + false, false, eAddressTypeInvalid, 0); // Cache the value if we got one back... if (synthetic_child) { @@ -1768,9 +1766,7 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset( ValueObjectSP synthetic_child_sp; if (name_const_str.IsEmpty()) { - char name_str[64]; - snprintf(name_str, sizeof(name_str), "@%i", offset); - name_const_str.SetCString(name_str); + name_const_str.SetString("@" + std::to_string(offset)); } // Check if we have already created a synthetic array member in this valid diff --git a/lldb/source/Core/ValueObjectCast.cpp b/lldb/source/Core/ValueObjectCast.cpp index 22e856be539b5..7b6d3591faf44 100644 --- a/lldb/source/Core/ValueObjectCast.cpp +++ b/lldb/source/Core/ValueObjectCast.cpp @@ -47,7 +47,7 @@ size_t ValueObjectCast::CalculateNumChildren(uint32_t max) { return children_count <= max ? 
children_count : max; } -uint64_t ValueObjectCast::GetByteSize() { +llvm::Optional ValueObjectCast::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); return m_value.GetValueByteSize(nullptr, &exe_ctx); } diff --git a/lldb/source/Core/ValueObjectChild.cpp b/lldb/source/Core/ValueObjectChild.cpp index 6205ed32c615a..28cb49328f34e 100644 --- a/lldb/source/Core/ValueObjectChild.cpp +++ b/lldb/source/Core/ValueObjectChild.cpp @@ -57,15 +57,8 @@ size_t ValueObjectChild::CalculateNumChildren(uint32_t max) { static void AdjustForBitfieldness(ConstString &name, uint8_t bitfield_bit_size) { - if (name && bitfield_bit_size) { - const char *compiler_type_name = name.AsCString(); - if (compiler_type_name) { - std::vector bitfield_type_name(strlen(compiler_type_name) + 32, 0); - ::snprintf(&bitfield_type_name.front(), bitfield_type_name.size(), - "%s:%u", compiler_type_name, bitfield_bit_size); - name.SetCString(&bitfield_type_name.front()); - } - } + if (name && bitfield_bit_size) + name.SetString(llvm::formatv("{0}:{1}", name, bitfield_bit_size).str()); } ConstString ValueObjectChild::GetTypeName() { diff --git a/lldb/source/Core/ValueObjectConstResult.cpp b/lldb/source/Core/ValueObjectConstResult.cpp index 8d84f8e62ccc5..ceb4491f86663 100644 --- a/lldb/source/Core/ValueObjectConstResult.cpp +++ b/lldb/source/Core/ValueObjectConstResult.cpp @@ -40,8 +40,7 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ByteOrder byte_order, uint32_t addr_byte_size, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { SetIsConstant(); SetValueIsValid(true); m_data.SetByteOrder(byte_order); @@ -64,8 +63,7 @@ ValueObjectConstResult::ValueObjectConstResult( ExecutionContextScope *exe_scope, ValueObjectManager &manager, const CompilerType &compiler_type, ConstString name, const DataExtractor &data, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { m_data = data; if (!m_data.GetSharedDataBuffer()) { @@ -112,8 +110,7 @@ ValueObjectConstResult::ValueObjectConstResult( const CompilerType &compiler_type, ConstString name, const lldb::DataBufferSP &data_sp, lldb::ByteOrder data_byte_order, uint32_t data_addr_size, lldb::addr_t address) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this, address) { + : ValueObject(exe_scope, manager), m_impl(this, address) { m_data.SetByteOrder(data_byte_order); m_data.SetAddressByteSize(data_addr_size); m_data.SetData(data_sp); @@ -143,7 +140,7 @@ ValueObjectConstResult::ValueObjectConstResult( ExecutionContextScope *exe_scope, ValueObjectManager &manager, const CompilerType &compiler_type, ConstString name, lldb::addr_t address, AddressType address_type, uint32_t addr_byte_size) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), + : ValueObject(exe_scope, manager), m_type_name(), m_impl(this, address) { m_value.GetScalar() = address; m_data.SetAddressByteSize(addr_byte_size); @@ -179,8 +176,7 @@ ValueObjectSP ValueObjectConstResult::Create(ExecutionContextScope *exe_scope, ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Status &error) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_error = 
error; SetIsConstant(); } @@ -189,8 +185,7 @@ ValueObjectConstResult::ValueObjectConstResult(ExecutionContextScope *exe_scope, ValueObjectManager &manager, const Value &value, ConstString name, Module *module) - : ValueObject(exe_scope, manager), m_type_name(), m_byte_size(0), - m_impl(this) { + : ValueObject(exe_scope, manager), m_impl(this) { m_value = value; m_name = name; ExecutionContext exe_ctx; @@ -208,9 +203,9 @@ lldb::ValueType ValueObjectConstResult::GetValueType() const { return eValueTypeConstResult; } -uint64_t ValueObjectConstResult::GetByteSize() { +llvm::Optional ValueObjectConstResult::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); - if (m_byte_size == 0) { + if (!m_byte_size) { if (auto size = GetCompilerType().GetByteSize(exe_ctx.GetBestExecutionContextScope())) SetByteSize(*size); diff --git a/lldb/source/Core/ValueObjectDynamicValue.cpp b/lldb/source/Core/ValueObjectDynamicValue.cpp index ca66740cb55d4..1c25b8c85a059 100644 --- a/lldb/source/Core/ValueObjectDynamicValue.cpp +++ b/lldb/source/Core/ValueObjectDynamicValue.cpp @@ -98,7 +98,7 @@ size_t ValueObjectDynamicValue::CalculateNumChildren(uint32_t max) { return m_parent->GetNumChildren(max); } -uint64_t ValueObjectDynamicValue::GetByteSize() { +llvm::Optional ValueObjectDynamicValue::GetByteSize() { const bool success = UpdateValueIfNeeded(false); if (success && m_dynamic_type_info.HasType()) { ExecutionContext exe_ctx(GetExecutionContextRef()); diff --git a/lldb/source/Core/ValueObjectMemory.cpp b/lldb/source/Core/ValueObjectMemory.cpp index 8e7d3ebc93f69..17fade9e5fdc3 100644 --- a/lldb/source/Core/ValueObjectMemory.cpp +++ b/lldb/source/Core/ValueObjectMemory.cpp @@ -139,13 +139,11 @@ size_t ValueObjectMemory::CalculateNumChildren(uint32_t max) { return child_count <= max ? child_count : max; } -uint64_t ValueObjectMemory::GetByteSize() { +llvm::Optional ValueObjectMemory::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); if (m_type_sp) - return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); - return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()) - .getValueOr(0); + return m_type_sp->GetByteSize(exe_ctx.GetBestExecutionContextScope()); + return m_compiler_type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectMemory::GetValueType() const { diff --git a/lldb/source/Core/ValueObjectRegister.cpp b/lldb/source/Core/ValueObjectRegister.cpp index ec87c38fb3679..27461e9cebc41 100644 --- a/lldb/source/Core/ValueObjectRegister.cpp +++ b/lldb/source/Core/ValueObjectRegister.cpp @@ -81,7 +81,7 @@ size_t ValueObjectRegisterSet::CalculateNumChildren(uint32_t max) { return 0; } -uint64_t ValueObjectRegisterSet::GetByteSize() { return 0; } +llvm::Optional ValueObjectRegisterSet::GetByteSize() { return 0; } bool ValueObjectRegisterSet::UpdateValue() { m_error.Clear(); @@ -229,7 +229,9 @@ size_t ValueObjectRegister::CalculateNumChildren(uint32_t max) { return children_count <= max ? 
children_count : max; } -uint64_t ValueObjectRegister::GetByteSize() { return m_reg_info.byte_size; } +llvm::Optional ValueObjectRegister::GetByteSize() { + return m_reg_info.byte_size; +} bool ValueObjectRegister::UpdateValue() { m_error.Clear(); diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index 32d1e6ab8368c..fb2d32e602cea 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -121,7 +121,9 @@ bool ValueObjectSynthetic::MightHaveChildren() { return (m_might_have_children != eLazyBoolNo); } -uint64_t ValueObjectSynthetic::GetByteSize() { return m_parent->GetByteSize(); } +llvm::Optional ValueObjectSynthetic::GetByteSize() { + return m_parent->GetByteSize(); +} lldb::ValueType ValueObjectSynthetic::GetValueType() const { return m_parent->GetValueType(); diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 0d1e7b047a0ac..ab67e3038cf0a 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -105,15 +105,15 @@ size_t ValueObjectVariable::CalculateNumChildren(uint32_t max) { return child_count <= max ? child_count : max; } -uint64_t ValueObjectVariable::GetByteSize() { +llvm::Optional ValueObjectVariable::GetByteSize() { ExecutionContext exe_ctx(GetExecutionContextRef()); CompilerType type(GetCompilerType()); if (!type.IsValid()) - return 0; + return {}; - return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()).getValueOr(0); + return type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); } lldb::ValueType ValueObjectVariable::GetValueType() const { diff --git a/lldb/source/Expression/ExpressionVariable.cpp b/lldb/source/Expression/ExpressionVariable.cpp index d95f0745cf4ba..8b3dda7b2fe10 100644 --- a/lldb/source/Expression/ExpressionVariable.cpp +++ b/lldb/source/Expression/ExpressionVariable.cpp @@ -16,10 +16,10 @@ using namespace lldb_private; ExpressionVariable::~ExpressionVariable() {} uint8_t *ExpressionVariable::GetValueBytes() { - const size_t byte_size = m_frozen_sp->GetByteSize(); - if (byte_size > 0) { - if (m_frozen_sp->GetDataExtractor().GetByteSize() < byte_size) { - m_frozen_sp->GetValue().ResizeData(byte_size); + llvm::Optional byte_size = m_frozen_sp->GetByteSize(); + if (byte_size && *byte_size) { + if (m_frozen_sp->GetDataExtractor().GetByteSize() < *byte_size) { + m_frozen_sp->GetValue().ResizeData(*byte_size); m_frozen_sp->GetValue().GetData(m_frozen_sp->GetDataExtractor()); } return const_cast( diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index 6f8d9b154570a..327e15a26266f 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -67,7 +67,7 @@ class EntityPersistentVariable : public Materializer::Entity { const bool zero_memory = false; lldb::addr_t mem = map.Malloc( - m_persistent_variable_sp->GetByteSize(), 8, + m_persistent_variable_sp->GetByteSize().getValueOr(0), 8, lldb::ePermissionsReadable | lldb::ePermissionsWritable, IRMemoryMap::eAllocationPolicyMirror, zero_memory, allocate_error); @@ -106,7 +106,8 @@ class EntityPersistentVariable : public Materializer::Entity { Status write_error; map.WriteMemory(mem, m_persistent_variable_sp->GetValueBytes(), - m_persistent_variable_sp->GetByteSize(), write_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), + write_error); if (!write_error.Success()) { err.SetErrorStringWithFormat( 
@@ -234,7 +235,7 @@ class EntityPersistentVariable : public Materializer::Entity { map.GetBestExecutionContextScope(), m_persistent_variable_sp.get()->GetCompilerType(), m_persistent_variable_sp->GetName(), location, eAddressTypeLoad, - m_persistent_variable_sp->GetByteSize()); + m_persistent_variable_sp->GetByteSize().getValueOr(0)); if (frame_top != LLDB_INVALID_ADDRESS && frame_bottom != LLDB_INVALID_ADDRESS && location >= frame_bottom && @@ -279,7 +280,8 @@ class EntityPersistentVariable : public Materializer::Entity { LLDB_LOGF(log, "Dematerializing %s from 0x%" PRIx64 " (size = %llu)", m_persistent_variable_sp->GetName().GetCString(), (uint64_t)mem, - (unsigned long long)m_persistent_variable_sp->GetByteSize()); + (unsigned long long)m_persistent_variable_sp->GetByteSize() + .getValueOr(0)); // Read the contents of the spare memory area @@ -288,7 +290,7 @@ class EntityPersistentVariable : public Materializer::Entity { Status read_error; map.ReadMemory(m_persistent_variable_sp->GetValueBytes(), mem, - m_persistent_variable_sp->GetByteSize(), read_error); + m_persistent_variable_sp->GetByteSize().getValueOr(0), read_error); if (!read_error.Success()) { err.SetErrorStringWithFormat( @@ -369,10 +371,11 @@ class EntityPersistentVariable : public Materializer::Entity { if (!err.Success()) { dump_stream.Printf(" \n"); } else { - DataBufferHeap data(m_persistent_variable_sp->GetByteSize(), 0); + DataBufferHeap data( + m_persistent_variable_sp->GetByteSize().getValueOr(0), 0); map.ReadMemory(data.GetBytes(), target_address, - m_persistent_variable_sp->GetByteSize(), err); + m_persistent_variable_sp->GetByteSize().getValueOr(0), err); if (!err.Success()) { dump_stream.Printf(" \n"); @@ -621,8 +624,8 @@ class EntityVariable : public Materializer::Entity { Status extract_error; - map.GetMemoryData(data, m_temporary_allocation, valobj_sp->GetByteSize(), - extract_error); + map.GetMemoryData(data, m_temporary_allocation, + valobj_sp->GetByteSize().getValueOr(0), extract_error); if (!extract_error.Success()) { err.SetErrorStringWithFormat("couldn't get the data for variable %s", @@ -919,7 +922,7 @@ class EntityResultVariable : public Materializer::Entity { ret->ValueUpdated(); - const size_t pvar_byte_size = ret->GetByteSize(); + const size_t pvar_byte_size = ret->GetByteSize().getValueOr(0); uint8_t *pvar_data = ret->GetValueBytes(); map.ReadMemory(pvar_data, address, pvar_byte_size, read_error); diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 226e638aba250..49b7a38d8dae4 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -48,9 +48,12 @@ int setupterm(char *term, int fildes, int *errret); // understand the relationship between DisplayInput(), MoveCursor(), // SetCurrentLine(), and SaveEditedLine() before making changes. +/// https://www.ecma-international.org/publications/files/ECMA-ST/Ecma-048.pdf #define ESCAPE "\x1b" +/// Faint, decreased intensity or second colour. #define ANSI_FAINT ESCAPE "[2m" -#define ANSI_UNFAINT ESCAPE "[22m" +/// Normal colour or normal intensity (neither bold nor faint). 
+#define ANSI_UNFAINT ESCAPE "[0m" #define ANSI_CLEAR_BELOW ESCAPE "[J" #define ANSI_CLEAR_RIGHT ESCAPE "[K" #define ANSI_SET_COLUMN_N ESCAPE "[%dG" diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 50a69b29260ca..4786e4602e4b4 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -631,15 +631,10 @@ void CommandInterpreter::LoadCommandDictionary() { if (tbreak_regex_cmd_up) { bool success = true; for (size_t i = 0; i < num_regexes; i++) { - // If you add a resultant command string longer than 1024 characters be - // sure to increase the size of this buffer. - char buffer[1024]; - int num_printed = - snprintf(buffer, 1024, "%s %s", break_regexes[i][1], "-o 1"); - lldbassert(num_printed < 1024); - UNUSED_IF_ASSERT_DISABLED(num_printed); + std::string command = break_regexes[i][1]; + command += " -o 1"; success = - tbreak_regex_cmd_up->AddRegexCommand(break_regexes[i][0], buffer); + tbreak_regex_cmd_up->AddRegexCommand(break_regexes[i][0], command); if (!success) break; } @@ -1987,10 +1982,7 @@ void CommandInterpreter::BuildAliasCommandArgs(CommandObject *alias_cmd_obj, if (value_type != OptionParser::eOptionalArgument) new_args.AppendArgument(value); else { - char buffer[255]; - ::snprintf(buffer, sizeof(buffer), "%s%s", option.c_str(), - value.c_str()); - new_args.AppendArgument(llvm::StringRef(buffer)); + new_args.AppendArgument(option + value); } } else if (static_cast(index) >= cmd_args.GetArgumentCount()) { @@ -2012,10 +2004,7 @@ void CommandInterpreter::BuildAliasCommandArgs(CommandObject *alias_cmd_obj, if (value_type != OptionParser::eOptionalArgument) new_args.AppendArgument(cmd_args.GetArgumentAtIndex(index)); else { - char buffer[255]; - ::snprintf(buffer, sizeof(buffer), "%s%s", option.c_str(), - cmd_args.GetArgumentAtIndex(index)); - new_args.AppendArgument(buffer); + new_args.AppendArgument(option + cmd_args.GetArgumentAtIndex(index)); } used[index] = true; } diff --git a/lldb/source/Interpreter/CommandObjectRegexCommand.cpp b/lldb/source/Interpreter/CommandObjectRegexCommand.cpp index 5a0265e58c5c6..7485fd76cc25f 100644 --- a/lldb/source/Interpreter/CommandObjectRegexCommand.cpp +++ b/lldb/source/Interpreter/CommandObjectRegexCommand.cpp @@ -69,14 +69,13 @@ bool CommandObjectRegexCommand::DoExecute(llvm::StringRef command, return false; } -bool CommandObjectRegexCommand::AddRegexCommand(const char *re_cstr, - const char *command_cstr) { +bool CommandObjectRegexCommand::AddRegexCommand(llvm::StringRef re_cstr, + llvm::StringRef command_cstr) { m_entries.resize(m_entries.size() + 1); // Only add the regular expression if it compiles - m_entries.back().regex = - RegularExpression(llvm::StringRef::withNullAsEmpty(re_cstr)); + m_entries.back().regex = RegularExpression(re_cstr); if (m_entries.back().regex.IsValid()) { - m_entries.back().command.assign(command_cstr); + m_entries.back().command = command_cstr.str(); return true; } // The regex didn't compile... 
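The GetByteSize hunks above all follow one migration pattern: the return type changes from a plain integer to llvm::Optional<uint64_t>, so "size unknown" is no longer conflated with a size of zero, and each call site decides how much it cares. Below is a minimal sketch of that pattern; `ValueObject`, `AllocationSize`, and `DescribeSize` are simplified stand-ins invented for illustration, not the real LLDB classes.

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

#include "llvm/ADT/Optional.h"

// Simplified stand-in for the real LLDB ValueObject; only the Optional
// handling mirrors the patch.
struct ValueObject {
  // After the change, "size unknown" is representable as llvm::None
  // instead of being conflated with a size of zero.
  llvm::Optional<uint64_t> GetByteSize() const { return m_byte_size; }
  llvm::Optional<uint64_t> m_byte_size;
};

// Call sites that only need a best-effort number keep the old behavior
// via getValueOr(0), as the Materializer hunks do.
uint64_t AllocationSize(const ValueObject &v) {
  return v.GetByteSize().getValueOr(0);
}

// Call sites that must distinguish "unknown" from "zero" test the
// Optional first, as ExpressionVariable::GetValueBytes now does.
void DescribeSize(const ValueObject &v) {
  if (llvm::Optional<uint64_t> size = v.GetByteSize())
    std::printf("size = %" PRIu64 "\n", *size);
  else
    std::printf("size unknown\n");
}
```

Note that a later hunk in StackFrame.cpp uses `getValueOr(1)` rather than `getValueOr(0)` where the value feeds a division, so an unknown size cannot turn into a divide by zero.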
diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index 648fc4adf24fd..ca4f7332f2588 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -72,6 +72,9 @@ bool lldb_private::formatters::NSBundleSummaryProvider( valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeObjCID), true)); + if (!text) + return false; + StreamString summary_stream; bool was_nsstring_ok = NSStringSummaryProvider(*text, summary_stream, options); @@ -117,6 +120,10 @@ bool lldb_private::formatters::NSTimeZoneSummaryProvider( uint64_t offset = ptr_size; ValueObjectSP text(valobj.GetSyntheticChildAtOffset( offset, valobj.GetCompilerType(), true)); + + if (!text) + return false; + StreamString summary_stream; bool was_nsstring_ok = NSStringSummaryProvider(*text, summary_stream, options); @@ -162,6 +169,10 @@ bool lldb_private::formatters::NSNotificationSummaryProvider( uint64_t offset = ptr_size; ValueObjectSP text(valobj.GetSyntheticChildAtOffset( offset, valobj.GetCompilerType(), true)); + + if (!text) + return false; + StreamString summary_stream; bool was_nsstring_ok = NSStringSummaryProvider(*text, summary_stream, options); diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index ab1a6a8bb5f3e..338c798e6cef6 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -1990,6 +1990,8 @@ static bool ParseTrieEntries(DataExtractor &data, lldb::offset_t offset, if (e.entry.flags & EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) { e.entry.other = data.GetULEB128(&offset); uint64_t resolver_addr = e.entry.other; + if (text_seg_base_addr != LLDB_INVALID_ADDRESS) + resolver_addr += text_seg_base_addr; if (is_arm) resolver_addr &= THUMB_ADDRESS_BIT_MASK; resolver_addresses.insert(resolver_addr); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp index bd0a231303bd1..0160fb95c58a9 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp @@ -253,3 +253,11 @@ CoreSimulatorSupport::Device PlatformAppleSimulator::GetSimulatorDevice() { return CoreSimulatorSupport::Device(); } #endif + +bool PlatformAppleSimulator::GetSupportedArchitectureAtIndex(uint32_t idx, + ArchSpec &arch) { + if (idx >= m_supported_triples.size()) + return false; + arch = ArchSpec(m_supported_triples[idx]); + return true; +} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h index 8c0174f2946ed..6182acaf229ac 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.h @@ -44,6 +44,9 @@ class PlatformAppleSimulator : public PlatformDarwin { lldb_private::Target *target, lldb_private::Status &error) override; + bool GetSupportedArchitectureAtIndex(uint32_t idx, + lldb_private::ArchSpec &arch) override; + protected: std::mutex m_core_sim_path_mutex; llvm::Optional<lldb_private::FileSpec> m_core_simulator_framework_path; @@ -52,6 +55,9 @@ class PlatformAppleSimulator : public PlatformDarwin { lldb_private::FileSpec GetCoreSimulatorPath(); + llvm::Triple::OSType m_os_type = llvm::Triple::UnknownOS; + llvm::ArrayRef<llvm::StringRef> m_supported_triples = {}; + void LoadCoreSimulator(); #if defined(__APPLE__) 
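The PlatformAppleSimulator hunks above hoist the per-platform GetSupportedArchitectureAtIndex overrides into the base class and drive them from a triple table that each concrete simulator platform fills in. A minimal sketch of that lookup follows; `ArchSpec` and `PlatformAppleSimulatorSketch` are simplified stand-ins, not the LLDB classes.

```cpp
#include <cstdint>
#include <string>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

// Simplified stand-in for lldb_private::ArchSpec.
struct ArchSpec {
  ArchSpec() = default;
  explicit ArchSpec(llvm::StringRef triple) : m_triple(triple.str()) {}
  std::string m_triple;
};

class PlatformAppleSimulatorSketch {
public:
  // The base class just indexes the table; an out-of-range index ends the
  // caller's iteration over supported architectures.
  bool GetSupportedArchitectureAtIndex(uint32_t idx, ArchSpec &arch) {
    if (idx >= m_supported_triples.size())
      return false;
    arch = ArchSpec(m_supported_triples[idx]);
    return true;
  }

protected:
  // Concrete platforms point this at a function-local static array in
  // their constructor, e.g. {"arm64-apple-ios-simulator",
  // "x86_64-apple-ios-simulator"}.
  llvm::ArrayRef<llvm::StringRef> m_supported_triples;
};
```

This is also why the constructors in the following hunks declare their `supported_triples` arrays `static`: `ArrayRef` is a non-owning view, so the storage it points at must outlive the platform object.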
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp index 461624a2adaa8..27f798b00ebf4 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp @@ -77,6 +77,7 @@ PlatformSP PlatformAppleTVSimulator::CreateInstance(bool force, bool create = force; if (!create && arch && arch->IsValid()) { switch (arch->GetMachine()) { + case llvm::Triple::aarch64: case llvm::Triple::x86_64: { const llvm::Triple &triple = arch->GetTriple(); switch (triple.getVendor()) { @@ -144,7 +145,24 @@ const char *PlatformAppleTVSimulator::GetDescriptionStatic() { /// Default Constructor PlatformAppleTVSimulator::PlatformAppleTVSimulator() : PlatformAppleSimulator( - CoreSimulatorSupport::DeviceType::ProductFamilyID::appleTV) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::appleTV) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-tvos-simulator", + "arm64-apple-tvos-simulator", + "x86_64h-apple-tvos-simulator", + "x86_64-apple-tvos-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64h-apple-tvos-simulator", + "x86_64-apple-tvos-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. /// @@ -322,19 +340,3 @@ uint32_t PlatformAppleTVSimulator::FindProcesses( } return process_infos.size(); } - -bool PlatformAppleTVSimulator::GetSupportedArchitectureAtIndex(uint32_t idx, - ArchSpec &arch) { - static const ArchSpec platform_arch( - HostInfo::GetArchitecture(HostInfo::eArchKind64)); - - if (idx == 0) { - arch = platform_arch; - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::TvOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h index 5a7b0ee0d7dc9..a94f94f9f57f7 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h @@ -62,9 +62,6 @@ class PlatformAppleTVSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp index 03a8fcd313602..79f254c43a6ae 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp @@ -76,6 +76,7 @@ PlatformSP PlatformAppleWatchSimulator::CreateInstance(bool force, bool create = force; if (!create && arch && arch->IsValid()) { switch (arch->GetMachine()) { + case llvm::Triple::aarch64: case llvm::Triple::x86_64: case llvm::Triple::x86: { const llvm::Triple &triple = arch->GetTriple(); @@ -145,7 +146,23 @@ const char *PlatformAppleWatchSimulator::GetDescriptionStatic() { /// Default Constructor PlatformAppleWatchSimulator::PlatformAppleWatchSimulator() : PlatformAppleSimulator( - 
CoreSimulatorSupport::DeviceType::ProductFamilyID::appleWatch) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::appleWatch) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-watchos-simulator", + "arm64-apple-watchos-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64-apple-watchos-simulator", + "x86_64h-apple-watchos-simulator", + "i386-apple-watchos-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. /// @@ -325,24 +342,3 @@ uint32_t PlatformAppleWatchSimulator::FindProcesses( return process_infos.size(); } -bool PlatformAppleWatchSimulator::GetSupportedArchitectureAtIndex( - uint32_t idx, ArchSpec &arch) { - if (idx == 0) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind32); - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::WatchOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - - if (idx == 1) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind64); - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::WatchOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h index 96dcd16ffa993..78b936691b0c7 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h @@ -62,9 +62,6 @@ class PlatformAppleWatchSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp index a890d0afdf1e3..b73c06fcdc8b4 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp @@ -76,6 +76,7 @@ PlatformSP PlatformiOSSimulator::CreateInstance(bool force, bool create = force; if (!create && arch && arch->IsValid()) { switch (arch->GetMachine()) { + case llvm::Triple::aarch64: case llvm::Triple::x86_64: case llvm::Triple::x86: { const llvm::Triple &triple = arch->GetTriple(); @@ -148,7 +149,25 @@ const char *PlatformiOSSimulator::GetDescriptionStatic() { /// Default Constructor PlatformiOSSimulator::PlatformiOSSimulator() : PlatformAppleSimulator( - CoreSimulatorSupport::DeviceType::ProductFamilyID::iPhone) {} + CoreSimulatorSupport::DeviceType::ProductFamilyID::iPhone) { +#ifdef __APPLE__ +#if __arm64__ + static const llvm::StringRef supported_triples[] = { + "arm64e-apple-ios-simulator", + "arm64-apple-ios-simulator", + "x86_64-apple-ios-simulator", + "x86_64h-apple-ios-simulator", + }; +#else + static const llvm::StringRef supported_triples[] = { + "x86_64h-apple-ios-simulator", + "x86_64-apple-ios-simulator", + "i386-apple-ios-simulator", + }; +#endif + m_supported_triples = supported_triples; +#endif +} /// Destructor. 
/// @@ -328,43 +347,3 @@ PlatformiOSSimulator::FindProcesses(const ProcessInstanceInfoMatch &match_info, return process_infos.size(); } -bool PlatformiOSSimulator::GetSupportedArchitectureAtIndex(uint32_t idx, - ArchSpec &arch) { - static const ArchSpec platform_arch( - HostInfo::GetArchitecture(HostInfo::eArchKindDefault)); - static const ArchSpec platform_arch64( - HostInfo::GetArchitecture(HostInfo::eArchKind64)); - - if (idx == 0) { - arch = platform_arch; - if (arch.IsValid()) { - arch.GetTriple().setOS(llvm::Triple::IOS); - arch.GetTriple().setEnvironment(llvm::Triple::Simulator); - return true; - } - } else { - if (platform_arch.IsExactMatch(platform_arch64)) { - // This macosx platform supports both 32 and 64 bit. - if (idx == 1) { - // 32/64: return "x86_64-apple-macosx" for architecture 1 - arch = platform_arch64; - return true; - } else if (idx == 2 || idx == 3) { - arch = HostInfo::GetArchitecture(HostInfo::eArchKind32); - if (arch.IsValid()) { - if (idx == 2) - arch.GetTriple().setOS(llvm::Triple::IOS); - // 32/64: return "i386-apple-ios" for architecture 2 32/64: return - // "i386-apple-macosx" for architecture 3 - return true; - } - } - } else if (idx == 1) { - // This macosx platform supports only 32 bit, so return the *-apple- - // macosx version - arch = platform_arch; - return true; - } - } - return false; -} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h index 4d416d759bd2d..982f8e2de5e7a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h @@ -64,9 +64,6 @@ class PlatformiOSSimulator : public PlatformAppleSimulator { FindProcesses(const lldb_private::ProcessInstanceInfoMatch &match_info, lldb_private::ProcessInstanceInfoList &process_infos) override; - bool GetSupportedArchitectureAtIndex(uint32_t idx, - lldb_private::ArchSpec &arch) override; - void AddClangModuleCompilationOptions(lldb_private::Target *target, std::vector &options) override { diff --git a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp index a44080640f6c4..f1a424ccbca57 100644 --- a/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp @@ -683,6 +683,9 @@ ProcessFreeBSD::GetSoftwareBreakpointTrapOpcode(BreakpointSite *bp_site) { } Status ProcessFreeBSD::EnableBreakpointSite(BreakpointSite *bp_site) { + if (bp_site->HardwareRequired()) + return Status("Hardware breakpoints are not supported."); + return EnableSoftwareBreakpoint(bp_site); } diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/CommunicationKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/CommunicationKDP.cpp index 06190d0c036de..dc283fce81046 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/CommunicationKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/CommunicationKDP.cpp @@ -194,12 +194,11 @@ size_t CommunicationKDP::WaitForPacketWithTimeoutMicroSecondsNoLock( : std::chrono::microseconds(timeout_usec), status, &error); - LLDB_LOGV(log, - "Read (buffer, sizeof(buffer), timeout_usec = 0x{0:x}, " - "status = {1}, error = {2}) => bytes_read = {4}", - timeout_usec, - Communication::ConnectionStatusAsCString(status), - error, bytes_read); + LLDB_LOGV(log, + "Read (buffer, sizeof(buffer), timeout_usec = 0x{0:x}, " + "status = {1}, error = {2}) => bytes_read = {4}", + timeout_usec, Communication::ConnectionStatusAsString(status), + 
error, bytes_read); if (bytes_read > 0) { if (CheckForPacket(buffer, bytes_read, packet)) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index dde25184a8c59..6e394eac6f9e6 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -644,6 +644,9 @@ Status ProcessKDP::DoDeallocateMemory(lldb::addr_t addr) { } Status ProcessKDP::EnableBreakpointSite(BreakpointSite *bp_site) { + if (bp_site->HardwareRequired()) + return Status("Hardware breakpoints are not supported."); + if (m_comm.LocalBreakpointsAreSupported()) { Status error; if (!bp_site->IsEnabled()) { diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp index 8a85c8ba6f4e4..07a81cdf69ccd 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessDebugger.cpp @@ -227,22 +227,20 @@ Status ProcessDebugger::DestroyProcess(const lldb::StateType state) { debugger_thread = m_session_data->m_debugger; } - Status error; - if (state != eStateExited && state != eStateDetached) { - LLDB_LOG( - log, "Shutting down process {0}.", - debugger_thread->GetProcess().GetNativeProcess().GetSystemHandle()); - error = debugger_thread->StopDebugging(true); - - // By the time StopDebugging returns, there is no more debugger thread, so - // we can be assured that no other thread will race for the session data. - m_session_data.reset(); - } else { - error.SetErrorStringWithFormat("cannot destroy process %" PRIx64 - " while state = %d", - GetDebuggedProcessId(), state); - LLDB_LOG(log, "error: {0}", error); + if (state == eStateExited || state == eStateDetached) { + LLDB_LOG(log, "warning: cannot destroy process {0} while state = {1}.", + GetDebuggedProcessId(), state); + return Status(); } + + LLDB_LOG(log, "Shutting down process {0}.", + debugger_thread->GetProcess().GetNativeProcess().GetSystemHandle()); + auto error = debugger_thread->StopDebugging(true); + + // By the time StopDebugging returns, there is no more debugger thread, so + // we can be assured that no other thread will race for the session data. 
+ m_session_data.reset(); + return error; } diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 7b020f55e9938..96e2603b993ea 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -150,6 +150,9 @@ lldb_private::ConstString ProcessWindows::GetPluginName() { uint32_t ProcessWindows::GetPluginVersion() { return 1; } Status ProcessWindows::EnableBreakpointSite(BreakpointSite *bp_site) { + if (bp_site->HardwareRequired()) + return Status("Hardware breakpoints are not supported."); + Log *log = ProcessWindowsLog::GetLogIfAny(WINDOWS_LOG_BREAKPOINTS); LLDB_LOG(log, "bp_site = {0:x}, id={1}, addr={2:x}", bp_site, bp_site->GetID(), bp_site->GetLoadAddress()); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp index 6a60502416959..832760f7f0dcc 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp @@ -284,7 +284,7 @@ GDBRemoteCommunication::WaitForPacketNoLock(StringExtractorGDBRemote &packet, LLDB_LOGV(log, "Read(buffer, sizeof(buffer), timeout = {0}, " "status = {1}, error = {2}) => bytes_read = {3}", - timeout, Communication::ConnectionStatusAsCString(status), error, + timeout, Communication::ConnectionStatusAsString(status), error, bytes_read); if (bytes_read > 0) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 1fed8e0642670..8dea8b9809854 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -3204,14 +3204,8 @@ Status ProcessGDBRemote::DisableBreakpointSite(BreakpointSite *bp_site) { break; case BreakpointSite::eExternal: { - GDBStoppointType stoppoint_type; - if (bp_site->IsHardware()) - stoppoint_type = eBreakpointHardware; - else - stoppoint_type = eBreakpointSoftware; - - if (m_gdb_comm.SendGDBStoppointTypePacket(stoppoint_type, false, addr, - bp_op_size)) + if (m_gdb_comm.SendGDBStoppointTypePacket(eBreakpointSoftware, false, + addr, bp_op_size)) error.SetErrorToGenericError(); } break; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 35e7c34734e2c..7e3628504727a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1641,33 +1641,6 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); - if (attrs.is_forward_declaration && die.HasChildren()) { - // Check to see if the DIE actually has a definition, some version of - // GCC will - // emit DIEs with DW_AT_declaration set to true, but yet still have - // subprogram, members, or inheritance, so we can't trust it - DWARFDIE child_die = die.GetFirstChild(); - while (child_die) { - switch (child_die.Tag()) { - case DW_TAG_inheritance: - case DW_TAG_subprogram: - case DW_TAG_member: - case DW_TAG_APPLE_property: - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_enumeration_type: - case DW_TAG_typedef: - case DW_TAG_union_type: - child_die.Clear(); - attrs.is_forward_declaration = false; - break; - 
default: - child_die = child_die.GetSibling(); - break; - } - } - } - if (!attrs.is_forward_declaration) { // Always start the definition for a class type so that if the class // has child classes or types that require the class to be created diff --git a/lldb/source/Target/LanguageRuntime.cpp b/lldb/source/Target/LanguageRuntime.cpp index 58ad70c2b9028..0bbb9660f7412 100644 --- a/lldb/source/Target/LanguageRuntime.cpp +++ b/lldb/source/Target/LanguageRuntime.cpp @@ -202,24 +202,19 @@ class ExceptionBreakpointResolver : public BreakpointResolver { LanguageRuntime *LanguageRuntime::FindPlugin(Process *process, lldb::LanguageType language) { - std::unique_ptr language_runtime_up; LanguageRuntimeCreateInstance create_callback; - for (uint32_t idx = 0; (create_callback = PluginManager::GetLanguageRuntimeCreateCallbackAtIndex(idx)) != nullptr; ++idx) { - language_runtime_up.reset(create_callback(process, language)); - - if (language_runtime_up) - return language_runtime_up.release(); + if (LanguageRuntime *runtime = create_callback(process, language)) + return runtime; } - return nullptr; } -LanguageRuntime::LanguageRuntime(Process *process) : m_process(process) {} +LanguageRuntime::LanguageRuntime(Process *process) : Runtime(process) {} LanguageRuntime::~LanguageRuntime() = default; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index b7694ec43f341..36a2930f79150 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -2291,6 +2291,9 @@ size_t Process::WriteMemory(addr_t addr, const void *buf, size_t size, if (error.Fail()) return; + if (bp->GetType() != BreakpointSite::eSoftware) + return; + addr_t intersect_addr; size_t intersect_size; size_t opcode_offset; @@ -5757,41 +5760,25 @@ addr_t Process::ResolveIndirectFunction(const Address *address, Status &error) { } void Process::ModulesDidLoad(ModuleList &module_list) { + // Inform the system runtime of the modified modules. SystemRuntime *sys_runtime = GetSystemRuntime(); - if (sys_runtime) { + if (sys_runtime) sys_runtime->ModulesDidLoad(module_list); - } GetJITLoaders().ModulesDidLoad(module_list); - // Give runtimes a chance to be created. + // Give the instrumentation runtimes a chance to be created before informing + // them of the modified modules. InstrumentationRuntime::ModulesDidLoad(module_list, this, m_instrumentation_runtimes); + for (auto &runtime : m_instrumentation_runtimes) + runtime.second->ModulesDidLoad(module_list); - // Tell runtimes about new modules. - for (auto pos = m_instrumentation_runtimes.begin(); - pos != m_instrumentation_runtimes.end(); ++pos) { - InstrumentationRuntimeSP runtime = pos->second; - runtime->ModulesDidLoad(module_list); - } - - // Let any language runtimes we have already created know about the modules - // that loaded. - - // Iterate over a copy of this language runtime list in case the language - // runtime ModulesDidLoad somehow causes the language runtime to be - // unloaded. - { - std::lock_guard guard(m_language_runtimes_mutex); - LanguageRuntimeCollection language_runtimes(m_language_runtimes); - for (const auto &pair : language_runtimes) { - // We must check language_runtime_sp to make sure it is not nullptr as we - // might cache the fact that we didn't have a language runtime for a - // language. 
- LanguageRuntimeSP language_runtime_sp = pair.second; - if (language_runtime_sp) - language_runtime_sp->ModulesDidLoad(module_list); - } + // Give the language runtimes a chance to be created before informing them of + // the modified modules. + for (const lldb::LanguageType lang_type : Language::GetSupportedLanguages()) { + if (LanguageRuntime *runtime = GetLanguageRuntime(lang_type)) + runtime->ModulesDidLoad(module_list); } // If we don't have an operating system plug-in, try to load one since @@ -5799,7 +5786,7 @@ void Process::ModulesDidLoad(ModuleList &module_list) { if (!m_os_up) LoadOperatingSystemPlugin(false); - // Give structured-data plugins a chance to see the modified modules. + // Inform the structured-data plugins of the modified modules. for (auto pair : m_structured_data_plugin_map) { if (pair.second) pair.second->ModulesDidLoad(*this, module_list); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 098aed9cd8125..22bca52d7f98a 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1408,7 +1408,7 @@ ValueObjectSP GetValueForOffset(StackFrame &frame, ValueObjectSP &parent, } int64_t child_offset = child_sp->GetByteOffset(); - int64_t child_size = child_sp->GetByteSize(); + int64_t child_size = child_sp->GetByteSize().getValueOr(0); if (offset >= child_offset && offset < (child_offset + child_size)) { return GetValueForOffset(frame, child_sp, offset - child_offset); @@ -1441,8 +1441,8 @@ ValueObjectSP GetValueForDereferincingOffset(StackFrame &frame, } if (offset >= 0 && uint64_t(offset) >= pointee->GetByteSize()) { - int64_t index = offset / pointee->GetByteSize(); - offset = offset % pointee->GetByteSize(); + int64_t index = offset / pointee->GetByteSize().getValueOr(1); + offset = offset % pointee->GetByteSize().getValueOr(1); const bool can_create = true; pointee = base->GetSyntheticArrayMember(index, can_create); } diff --git a/lldb/source/Target/SystemRuntime.cpp b/lldb/source/Target/SystemRuntime.cpp index cd3d8ba2c7b09..6d8a2ef55225f 100644 --- a/lldb/source/Target/SystemRuntime.cpp +++ b/lldb/source/Target/SystemRuntime.cpp @@ -27,9 +27,7 @@ SystemRuntime *SystemRuntime::FindPlugin(Process *process) { return nullptr; } -// SystemRuntime constructor -SystemRuntime::SystemRuntime(Process *process) - : m_process(process), m_types() {} +SystemRuntime::SystemRuntime(Process *process) : Runtime(process), m_types() {} SystemRuntime::~SystemRuntime() = default; @@ -39,7 +37,7 @@ void SystemRuntime::DidLaunch() {} void SystemRuntime::Detach() {} -void SystemRuntime::ModulesDidLoad(ModuleList &module_list) {} +void SystemRuntime::ModulesDidLoad(const ModuleList &module_list) {} const std::vector &SystemRuntime::GetExtendedBacktraceTypes() { return m_types; diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp index 3974cb5de4198..7e243e5ed3380 100644 --- a/lldb/source/Target/TargetList.cpp +++ b/lldb/source/Target/TargetList.cpp @@ -75,55 +75,49 @@ Status TargetList::CreateTargetInternal( const OptionGroupPlatform *platform_options, TargetSP &target_sp, bool is_dummy_target) { Status error; - PlatformSP platform_sp; - // This is purposely left empty unless it is specified by triple_cstr. If not - // initialized via triple_cstr, then the currently selected platform will set - // the architecture correctly. + // Let's start by looking at the selected platform. 
+ PlatformSP platform_sp = debugger.GetPlatformList().GetSelectedPlatform(); + + // This variable corresponds to the architecture specified by the triple + // string. If that string was empty the currently selected platform will + // determine the architecture. const ArchSpec arch(triple_str); - if (!triple_str.empty()) { - if (!arch.IsValid()) { - error.SetErrorStringWithFormat("invalid triple '%s'", - triple_str.str().c_str()); - return error; - } + if (!triple_str.empty() && !arch.IsValid()) { + error.SetErrorStringWithFormat("invalid triple '%s'", + triple_str.str().c_str()); + return error; } ArchSpec platform_arch(arch); - bool prefer_platform_arch = false; - - CommandInterpreter &interpreter = debugger.GetCommandInterpreter(); - - // let's see if there is already an existing platform before we go creating - // another... - platform_sp = debugger.GetPlatformList().GetSelectedPlatform(); - - if (platform_options && platform_options->PlatformWasSpecified()) { - // Create a new platform if it doesn't match the selected platform - if (!platform_options->PlatformMatches(platform_sp)) { - const bool select_platform = true; - platform_sp = platform_options->CreatePlatformWithOptions( - interpreter, arch, select_platform, error, platform_arch); - if (!platform_sp) - return error; - } + // Create a new platform if a platform was specified in the platform options + // and doesn't match the selected platform. + if (platform_options && platform_options->PlatformWasSpecified() && + !platform_options->PlatformMatches(platform_sp)) { + const bool select_platform = true; + platform_sp = platform_options->CreatePlatformWithOptions( + debugger.GetCommandInterpreter(), arch, select_platform, error, + platform_arch); + if (!platform_sp) + return error; } + bool prefer_platform_arch = false; + if (!user_exe_path.empty()) { - ModuleSpecList module_specs; - ModuleSpec module_spec; - module_spec.GetFileSpec().SetFile(user_exe_path, FileSpec::Style::native); + ModuleSpec module_spec(FileSpec(user_exe_path, FileSpec::Style::native)); FileSystem::Instance().Resolve(module_spec.GetFileSpec()); - // Resolve the executable in case we are given a path to a application - // bundle like a .app bundle on MacOSX + // bundle like a .app bundle on MacOSX. Host::ResolveExecutableInBundle(module_spec.GetFileSpec()); lldb::offset_t file_offset = 0; lldb::offset_t file_size = 0; + ModuleSpecList module_specs; const size_t num_specs = ObjectFile::GetModuleSpecifications( module_spec.GetFileSpec(), file_offset, file_size, module_specs); + if (num_specs > 0) { ModuleSpec matching_module_spec; @@ -134,7 +128,7 @@ Status TargetList::CreateTargetInternal( matching_module_spec.GetArchitecture())) { // If the OS or vendor weren't specified, then adopt the module's // architecture so that the platform matching can be more - // accurate + // accurate. if (!platform_arch.TripleOSWasSpecified() || !platform_arch.TripleVendorWasSpecified()) { prefer_platform_arch = true; @@ -155,113 +149,107 @@ Status TargetList::CreateTargetInternal( return error; } } else { - // Only one arch and none was specified + // Only one arch and none was specified. prefer_platform_arch = true; platform_arch = matching_module_spec.GetArchitecture(); } } + } else if (arch.IsValid()) { + // A (valid) architecture was specified. 
+ module_spec.GetArchitecture() = arch; + if (module_specs.FindMatchingModuleSpec(module_spec, + matching_module_spec)) { + prefer_platform_arch = true; + platform_arch = matching_module_spec.GetArchitecture(); + } } else { - if (arch.IsValid()) { - module_spec.GetArchitecture() = arch; - if (module_specs.FindMatchingModuleSpec(module_spec, - matching_module_spec)) { - prefer_platform_arch = true; - platform_arch = matching_module_spec.GetArchitecture(); - } - } else { - // No architecture specified, check if there is only one platform for - // all of the architectures. - - typedef std::vector PlatformList; - PlatformList platforms; - PlatformSP host_platform_sp = Platform::GetHostPlatform(); - for (size_t i = 0; i < num_specs; ++i) { - ModuleSpec module_spec; - if (module_specs.GetModuleSpecAtIndex(i, module_spec)) { - // See if there was a selected platform and check that first - // since the user may have specified it. - if (platform_sp) { - if (platform_sp->IsCompatibleArchitecture( - module_spec.GetArchitecture(), false, nullptr)) { - platforms.push_back(platform_sp); - continue; - } - } - - // Next check the host platform it if wasn't already checked - // above - if (host_platform_sp && - (!platform_sp || - host_platform_sp->GetName() != platform_sp->GetName())) { - if (host_platform_sp->IsCompatibleArchitecture( - module_spec.GetArchitecture(), false, nullptr)) { - platforms.push_back(host_platform_sp); - continue; - } + // No architecture specified, check if there is only one platform for + // all of the architectures. + PlatformSP host_platform_sp = Platform::GetHostPlatform(); + std::vector platforms; + for (size_t i = 0; i < num_specs; ++i) { + ModuleSpec module_spec; + if (module_specs.GetModuleSpecAtIndex(i, module_spec)) { + // First consider the platform specified by the user, if any, and + // the selected platform otherwise. + if (platform_sp) { + if (platform_sp->IsCompatibleArchitecture( + module_spec.GetArchitecture(), false, nullptr)) { + platforms.push_back(platform_sp); + continue; } + } - // Just find a platform that matches the architecture in the - // executable file - PlatformSP fallback_platform_sp( - Platform::GetPlatformForArchitecture( - module_spec.GetArchitecture(), nullptr)); - if (fallback_platform_sp) { - platforms.push_back(fallback_platform_sp); + // Now consider the host platform if it is different from the + // specified/selected platform. + if (host_platform_sp && + (!platform_sp || + host_platform_sp->GetName() != platform_sp->GetName())) { + if (host_platform_sp->IsCompatibleArchitecture( + module_spec.GetArchitecture(), false, nullptr)) { + platforms.push_back(host_platform_sp); + continue; } } - } - Platform *platform_ptr = nullptr; - bool more_than_one_platforms = false; - for (const auto &the_platform_sp : platforms) { - if (platform_ptr) { - if (platform_ptr->GetName() != the_platform_sp->GetName()) { - more_than_one_platforms = true; - platform_ptr = nullptr; - break; - } - } else { - platform_ptr = the_platform_sp.get(); + // Finally find a platform that matches the architecture in the + // executable file. 
+ PlatformSP fallback_platform_sp( + Platform::GetPlatformForArchitecture( + module_spec.GetArchitecture(), nullptr)); + if (fallback_platform_sp) { + platforms.push_back(fallback_platform_sp); } } + } + Platform *platform_ptr = nullptr; + bool more_than_one_platforms = false; + for (const auto &the_platform_sp : platforms) { if (platform_ptr) { - // All platforms for all modules in the executable match, so we can - // select this platform - platform_sp = platforms.front(); - } else if (!more_than_one_platforms) { - // No platforms claim to support this file - error.SetErrorString("No matching platforms found for this file, " - "specify one with the --platform option"); - return error; + if (platform_ptr->GetName() != the_platform_sp->GetName()) { + more_than_one_platforms = true; + platform_ptr = nullptr; + break; + } } else { - // More than one platform claims to support this file, so the - // --platform option must be specified - StreamString error_strm; - std::set platform_set; - error_strm.Printf( - "more than one platform supports this executable ("); - for (const auto &the_platform_sp : platforms) { - if (platform_set.find(the_platform_sp.get()) == - platform_set.end()) { - if (!platform_set.empty()) - error_strm.PutCString(", "); - error_strm.PutCString(the_platform_sp->GetName().GetCString()); - platform_set.insert(the_platform_sp.get()); - } + platform_ptr = the_platform_sp.get(); + } + } + + if (platform_ptr) { + // All platforms for all modules in the executable match, so we can + // select this platform. + platform_sp = platforms.front(); + } else if (!more_than_one_platforms) { + // No platforms claim to support this file. + error.SetErrorString("no matching platforms found for this file"); + return error; + } else { + // More than one platform claims to support this file. + StreamString error_strm; + std::set platform_set; + error_strm.Printf( + "more than one platform supports this executable ("); + for (const auto &the_platform_sp : platforms) { + if (platform_set.find(the_platform_sp.get()) == + platform_set.end()) { + if (!platform_set.empty()) + error_strm.PutCString(", "); + error_strm.PutCString(the_platform_sp->GetName().GetCString()); + platform_set.insert(the_platform_sp.get()); } - error_strm.Printf( - "), use the --platform option to specify a platform"); - error.SetErrorString(error_strm.GetString()); - return error; } + error_strm.Printf("), specify an architecture to disambiguate"); + error.SetErrorString(error_strm.GetString()); + return error; } } } } // If we have a valid architecture, make sure the current platform is - // compatible with that architecture + // compatible with that architecture. if (!prefer_platform_arch && arch.IsValid()) { if (!platform_sp->IsCompatibleArchitecture(arch, false, &platform_arch)) { platform_sp = Platform::GetPlatformForArchitecture(arch, &platform_arch); @@ -269,8 +257,8 @@ Status TargetList::CreateTargetInternal( debugger.GetPlatformList().SetSelectedPlatform(platform_sp); } } else if (platform_arch.IsValid()) { - // if "arch" isn't valid, yet "platform_arch" is, it means we have an - // executable file with a single architecture which should be used + // If "arch" isn't valid, yet "platform_arch" is, it means we have an + // executable file with a single architecture which should be used. 
ArchSpec fixed_platform_arch; if (!platform_sp->IsCompatibleArchitecture(platform_arch, false, &fixed_platform_arch)) { @@ -284,10 +272,9 @@ Status TargetList::CreateTargetInternal( if (!platform_arch.IsValid()) platform_arch = arch; - error = TargetList::CreateTargetInternal( + return TargetList::CreateTargetInternal( debugger, user_exe_path, platform_arch, load_dependent_files, platform_sp, target_sp, is_dummy_target); - return error; } lldb::TargetSP TargetList::GetDummyTarget(lldb_private::Debugger &debugger) { @@ -407,37 +394,38 @@ Status TargetList::CreateTargetInternal(Debugger &debugger, target_sp.reset(new Target(debugger, arch, platform_sp, is_dummy_target)); } - if (target_sp) { - // Set argv0 with what the user typed, unless the user specified a - // directory. If the user specified a directory, then it is probably a - // bundle that was resolved and we need to use the resolved bundle path - if (!user_exe_path.empty()) { - // Use exactly what the user typed as the first argument when we exec or - // posix_spawn - if (user_exe_path_is_bundle && resolved_bundle_exe_path[0]) { - target_sp->SetArg0(resolved_bundle_exe_path); - } else { - // Use resolved path - target_sp->SetArg0(file.GetPath().c_str()); - } - } - if (file.GetDirectory()) { - FileSpec file_dir; - file_dir.GetDirectory() = file.GetDirectory(); - target_sp->AppendExecutableSearchPaths(file_dir); - } + if (!target_sp) + return error; - // Don't put the dummy target in the target list, it's held separately. - if (!is_dummy_target) { - std::lock_guard guard(m_target_list_mutex); - m_selected_target_idx = m_target_list.size(); - m_target_list.push_back(target_sp); - // Now prime this from the dummy target: - target_sp->PrimeFromDummyTarget(debugger.GetDummyTarget()); + // Set argv0 with what the user typed, unless the user specified a + // directory. If the user specified a directory, then it is probably a + // bundle that was resolved and we need to use the resolved bundle path + if (!user_exe_path.empty()) { + // Use exactly what the user typed as the first argument when we exec or + // posix_spawn + if (user_exe_path_is_bundle && resolved_bundle_exe_path[0]) { + target_sp->SetArg0(resolved_bundle_exe_path); } else { - m_dummy_target_sp = target_sp; + // Use resolved path + target_sp->SetArg0(file.GetPath().c_str()); } } + if (file.GetDirectory()) { + FileSpec file_dir; + file_dir.GetDirectory() = file.GetDirectory(); + target_sp->AppendExecutableSearchPaths(file_dir); + } + + // Don't put the dummy target in the target list, it's held separately. 
+ if (!is_dummy_target) { + std::lock_guard guard(m_target_list_mutex); + m_selected_target_idx = m_target_list.size(); + m_target_list.push_back(target_sp); + // Now prime this from the dummy target: + target_sp->PrimeFromDummyTarget(debugger.GetDummyTarget()); + } else { + m_dummy_target_sp = target_sp; + } return error; } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 24cf4bf3ee1ed..ad28603cc02ec 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -1661,7 +1661,7 @@ Thread::GetStackFrameSPForStackFramePtr(StackFrame *stack_frame_ptr) { return GetStackFrameList()->GetStackFrameSPForStackFramePtr(stack_frame_ptr); } -const char *Thread::StopReasonAsCString(lldb::StopReason reason) { +std::string Thread::StopReasonAsString(lldb::StopReason reason) { switch (reason) { case eStopReasonInvalid: return "invalid"; @@ -1687,13 +1687,10 @@ const char *Thread::StopReasonAsCString(lldb::StopReason reason) { return "instrumentation break"; } - static char unknown_state_string[64]; - snprintf(unknown_state_string, sizeof(unknown_state_string), - "StopReason = %i", reason); - return unknown_state_string; + return "StopReason = " + std::to_string(reason); } -const char *Thread::RunModeAsCString(lldb::RunMode mode) { +std::string Thread::RunModeAsString(lldb::RunMode mode) { switch (mode) { case eOnlyThisThread: return "only this thread"; @@ -1703,10 +1700,7 @@ const char *Thread::RunModeAsCString(lldb::RunMode mode) { return "only during stepping"; } - static char unknown_state_string[64]; - snprintf(unknown_state_string, sizeof(unknown_state_string), "RunMode = %i", - mode); - return unknown_state_string; + return "RunMode = " + std::to_string(mode); } size_t Thread::GetStatus(Stream &strm, uint32_t start_frame, diff --git a/lldb/source/Target/ThreadPlanCallFunction.cpp b/lldb/source/Target/ThreadPlanCallFunction.cpp index dbe26f42c9bf7..f525173f8a51f 100644 --- a/lldb/source/Target/ThreadPlanCallFunction.cpp +++ b/lldb/source/Target/ThreadPlanCallFunction.cpp @@ -260,9 +260,9 @@ bool ThreadPlanCallFunction::DoPlanExplainsStop(Event *event_ptr) { stop_reason = eStopReasonNone; else stop_reason = m_real_stop_info_sp->GetStopReason(); - LLDB_LOGF(log, - "ThreadPlanCallFunction::PlanExplainsStop: Got stop reason - %s.", - Thread::StopReasonAsCString(stop_reason)); + LLDB_LOG(log, + "ThreadPlanCallFunction::PlanExplainsStop: Got stop reason - {0}.", + Thread::StopReasonAsString(stop_reason)); if (stop_reason == eStopReasonBreakpoint && BreakpointsExplainStop()) return true; diff --git a/lldb/source/Target/ThreadPlanStepOverBreakpoint.cpp b/lldb/source/Target/ThreadPlanStepOverBreakpoint.cpp index f3d35a91fcbc3..f188d827faae4 100644 --- a/lldb/source/Target/ThreadPlanStepOverBreakpoint.cpp +++ b/lldb/source/Target/ThreadPlanStepOverBreakpoint.cpp @@ -62,8 +62,8 @@ bool ThreadPlanStepOverBreakpoint::DoPlanExplainsStop(Event *event_ptr) { StopReason reason = stop_info_sp->GetStopReason(); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_STEP)); - LLDB_LOGF(log, "Step over breakpoint stopped for reason: %s.", - Thread::StopReasonAsCString(reason)); + LLDB_LOG(log, "Step over breakpoint stopped for reason: {0}.", + Thread::StopReasonAsString(reason)); switch (reason) { case eStopReasonTrace: diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index f220f4e30b29e..6e4f1b5326dd9 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -846,6 +846,15 @@ bool 
ArchSpec::ContainsOnlyArch(const llvm::Triple &normalized_triple) { } void ArchSpec::MergeFrom(const ArchSpec &other) { + // ios-macabi always wins over macosx. + if ((GetTriple().getOS() == llvm::Triple::MacOSX || + GetTriple().getOS() == llvm::Triple::UnknownOS) && + other.GetTriple().getOS() == llvm::Triple::IOS && + other.GetTriple().getEnvironment() == llvm::Triple::MacABI) { + (*this) = other; + return; + } + if (!TripleVendorWasSpecified() && other.TripleVendorWasSpecified()) GetTriple().setVendor(other.GetTriple().getVendor()); if (!TripleOSWasSpecified() && other.TripleOSWasSpecified()) @@ -978,6 +987,12 @@ static bool IsCompatibleEnvironment(llvm::Triple::EnvironmentType lhs, if (lhs == rhs) return true; + // Apple simulators are a different platform than what they simulate. + // As the environments are different at this point, if one of them is a + // simulator, then they are different. + if (lhs == llvm::Triple::Simulator || rhs == llvm::Triple::Simulator) + return false; + // If any of the environment is unknown then they are compatible if (lhs == llvm::Triple::UnknownEnvironment || rhs == llvm::Triple::UnknownEnvironment) @@ -1001,58 +1016,70 @@ static bool IsCompatibleEnvironment(llvm::Triple::EnvironmentType lhs, bool ArchSpec::IsEqualTo(const ArchSpec &rhs, bool exact_match) const { // explicitly ignoring m_distribution_id in this method. - if (GetByteOrder() != rhs.GetByteOrder()) + if (GetByteOrder() != rhs.GetByteOrder() || + !cores_match(GetCore(), rhs.GetCore(), true, exact_match)) return false; - const ArchSpec::Core lhs_core = GetCore(); - const ArchSpec::Core rhs_core = rhs.GetCore(); + const llvm::Triple &lhs_triple = GetTriple(); + const llvm::Triple &rhs_triple = rhs.GetTriple(); + + const llvm::Triple::VendorType lhs_triple_vendor = lhs_triple.getVendor(); + const llvm::Triple::VendorType rhs_triple_vendor = rhs_triple.getVendor(); + if (lhs_triple_vendor != rhs_triple_vendor) { + const bool rhs_vendor_specified = rhs.TripleVendorWasSpecified(); + const bool lhs_vendor_specified = TripleVendorWasSpecified(); + // Both architectures had the vendor specified, so if they aren't equal + // then we return false + if (rhs_vendor_specified && lhs_vendor_specified) + return false; + + // Only fail if both vendor types are not unknown + if (lhs_triple_vendor != llvm::Triple::UnknownVendor && + rhs_triple_vendor != llvm::Triple::UnknownVendor) + return false; + } - const bool core_match = cores_match(lhs_core, rhs_core, true, exact_match); - - if (core_match) { - const llvm::Triple &lhs_triple = GetTriple(); - const llvm::Triple &rhs_triple = rhs.GetTriple(); - - const llvm::Triple::VendorType lhs_triple_vendor = lhs_triple.getVendor(); - const llvm::Triple::VendorType rhs_triple_vendor = rhs_triple.getVendor(); - if (lhs_triple_vendor != rhs_triple_vendor) { - const bool rhs_vendor_specified = rhs.TripleVendorWasSpecified(); - const bool lhs_vendor_specified = TripleVendorWasSpecified(); - // Both architectures had the vendor specified, so if they aren't equal - // then we return false - if (rhs_vendor_specified && lhs_vendor_specified) - return false; - - // Only fail if both vendor types are not unknown - if (lhs_triple_vendor != llvm::Triple::UnknownVendor && - rhs_triple_vendor != llvm::Triple::UnknownVendor) - return false; - } + const llvm::Triple::OSType lhs_triple_os = lhs_triple.getOS(); + const llvm::Triple::OSType rhs_triple_os = rhs_triple.getOS(); + const llvm::Triple::EnvironmentType lhs_triple_env = + lhs_triple.getEnvironment(); + const 
llvm::Triple::EnvironmentType rhs_triple_env = + rhs_triple.getEnvironment(); + + if (!exact_match) { + // x86_64-apple-ios-macabi, x86_64-apple-macosx are compatible, no match. + if ((lhs_triple_os == llvm::Triple::IOS && + lhs_triple_env == llvm::Triple::MacABI && + rhs_triple_os == llvm::Triple::MacOSX) || + (lhs_triple_os == llvm::Triple::MacOSX && + rhs_triple_os == llvm::Triple::IOS && + rhs_triple_env == llvm::Triple::MacABI)) + return true; + } - const llvm::Triple::OSType lhs_triple_os = lhs_triple.getOS(); - const llvm::Triple::OSType rhs_triple_os = rhs_triple.getOS(); - if (lhs_triple_os != rhs_triple_os) { - const bool rhs_os_specified = rhs.TripleOSWasSpecified(); - const bool lhs_os_specified = TripleOSWasSpecified(); - // Both architectures had the OS specified, so if they aren't equal then - // we return false - if (rhs_os_specified && lhs_os_specified) - return false; - - // Only fail if both os types are not unknown - if (lhs_triple_os != llvm::Triple::UnknownOS && - rhs_triple_os != llvm::Triple::UnknownOS) - return false; - } + if (lhs_triple_os != rhs_triple_os) { + const bool rhs_os_specified = rhs.TripleOSWasSpecified(); + const bool lhs_os_specified = TripleOSWasSpecified(); + // Both architectures had the OS specified, so if they aren't equal then + // we return false + if (rhs_os_specified && lhs_os_specified) + return false; + + // Only fail if both os types are not unknown + if (lhs_triple_os != llvm::Triple::UnknownOS && + rhs_triple_os != llvm::Triple::UnknownOS) + return false; + } - const llvm::Triple::EnvironmentType lhs_triple_env = - lhs_triple.getEnvironment(); - const llvm::Triple::EnvironmentType rhs_triple_env = - rhs_triple.getEnvironment(); + // x86_64-apple-ios-macabi and x86_64-apple-ios are not compatible. + if (lhs_triple_os == llvm::Triple::IOS && + rhs_triple_os == llvm::Triple::IOS && + (lhs_triple_env == llvm::Triple::MacABI || + rhs_triple_env == llvm::Triple::MacABI) && + lhs_triple_env != rhs_triple_env) + return false; - return IsCompatibleEnvironment(lhs_triple_env, rhs_triple_env); - } - return false; + return IsCompatibleEnvironment(lhs_triple_env, rhs_triple_env); } void ArchSpec::UpdateCore() { diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 27d5b3b88d338..9309f8d662da8 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -126,8 +126,6 @@ static Scalar::Type PromoteToMaxType( return Scalar::e_void; } -Scalar::Scalar() : m_type(e_void), m_float(static_cast(0)) {} - bool Scalar::GetData(DataExtractor &data, size_t limit_byte_size) const { size_t byte_size = GetByteSize(); if (byte_size == 0) { @@ -232,8 +230,6 @@ void Scalar::GetValue(Stream *s, bool show_type) const { } } -Scalar::~Scalar() = default; - Scalar::Type Scalar::GetBestTypeForBitSize(size_t bit_size, bool sign) { // Scalar types are always host types, hence the sizeof(). 
if (sign) { diff --git a/lldb/test/API/commands/target/basic/TestTargetCommand.py b/lldb/test/API/commands/target/basic/TestTargetCommand.py index 83e27e2724642..be6eeb938ab8b 100644 --- a/lldb/test/API/commands/target/basic/TestTargetCommand.py +++ b/lldb/test/API/commands/target/basic/TestTargetCommand.py @@ -115,6 +115,33 @@ def do_target_command(self): self.runCmd("target list") + @no_debug_info_test + def test_target_create_invalid_arch(self): + exe = self.getBuildArtifact("a.out") + self.expect("target create {} --arch doesntexist".format(exe), error=True, + patterns=["error: invalid triple 'doesntexist'"]) + + @no_debug_info_test + def test_target_create_platform(self): + self.buildB() + exe = self.getBuildArtifact("b.out") + self.expect("target create {} --platform host".format(exe)) + + @no_debug_info_test + def test_target_create_unsupported_platform(self): + yaml = os.path.join(self.getSourceDir(), "bogus.yaml") + exe = self.getBuildArtifact("bogus") + self.yaml2obj(yaml, exe) + self.expect("target create {}".format(exe), error=True, + patterns=['error: no matching platforms found for this file']) + + @no_debug_info_test + def test_target_create_invalid_platform(self): + self.buildB() + exe = self.getBuildArtifact("b.out") + self.expect("target create {} --platform doesntexist".format(exe), error=True, + patterns=['error: unable to find a plug-in for the platform named "doesntexist"']) + def do_target_variable_command(self, exe_name): """Exercise 'target variable' command before and after starting the inferior.""" self.runCmd("file " + self.getBuildArtifact(exe_name), diff --git a/lldb/test/API/commands/target/basic/bogus.yaml b/lldb/test/API/commands/target/basic/bogus.yaml new file mode 100644 index 0000000000000..d36695cdedd6a --- /dev/null +++ b/lldb/test/API/commands/target/basic/bogus.yaml @@ -0,0 +1,194 @@ +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 3 +FatArchs: + - cputype: 0x0000000C + cpusubtype: 0x00000009 + offset: 0x0000000000004000 + size: 200 + align: 14 + - cputype: 0x0000000C + cpusubtype: 0x0000000B + offset: 0x0000000000008000 + size: 200 + align: 14 + - cputype: 0x0100000C + cpusubtype: 0x00000000 + offset: 0x000000000000C000 + size: 332 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACE + # Bogus + cputype: 0x00000003 + cpusubtype: 0x00000009 + filetype: 0x00000001 + ncmds: 4 + sizeofcmds: 112 + flags: 0x00002000 + LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 56 + segname: '' + vmaddr: 0 + vmsize: 0 + fileoff: 0 + filesize: 0 + maxprot: 7 + initprot: 7 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 172 + nsyms: 1 + stroff: 184 + strsize: 16 + - cmd: LC_VERSION_MIN_IPHONEOS + cmdsize: 16 + version: 327680 + sdk: 0 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 172 + datasize: 0 + LinkEditData: + NameList: + - n_strx: 4 + n_type: 0x01 + n_sect: 0 + n_desc: 512 + n_value: 4 + StringTable: + - '' + - '' + - '' + - '' + - _armv7_var + - '' + - !mach-o + FileHeader: + magic: 0xFEEDFACE + # Bogus + cputype: 0x00000002 + cpusubtype: 0x0000000B + filetype: 0x00000001 + ncmds: 4 + sizeofcmds: 112 + flags: 0x00002000 + LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 56 + segname: '' + vmaddr: 0 + vmsize: 0 + fileoff: 0 + filesize: 0 + maxprot: 7 + initprot: 7 + nsects: 0 + flags: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 172 + nsyms: 1 + stroff: 184 + strsize: 16 + - cmd: LC_VERSION_MIN_IPHONEOS + cmdsize: 16 + version: 327680 + sdk: 0 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 172 + datasize: 0 + 
LinkEditData: + NameList: + - n_strx: 4 + n_type: 0x01 + n_sect: 0 + n_desc: 512 + n_value: 4 + StringTable: + - '' + - '' + - '' + - '' + - _armv7s_var + - !mach-o + FileHeader: + magic: 0xFEEDFACF + # Bogus + cputype: 0x00000001 + cpusubtype: 0x00000000 + filetype: 0x00000001 + ncmds: 4 + sizeofcmds: 208 + flags: 0x00002000 + reserved: 0x00000000 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: '' + vmaddr: 0 + vmsize: 0 + fileoff: 272 + filesize: 0 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 0 + offset: 0x00000110 + align: 0 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: '' + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 276 + nsyms: 2 + stroff: 308 + strsize: 24 + - cmd: LC_VERSION_MIN_IPHONEOS + cmdsize: 16 + version: 327680 + sdk: 0 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 276 + datasize: 0 + LinkEditData: + NameList: + - n_strx: 15 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 4 + n_type: 0x01 + n_sect: 0 + n_desc: 512 + n_value: 4 + StringTable: + - '' + - '' + - '' + - '' + - _arm64_var + - ltmp0 + - '' + - '' + - '' +... diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/base.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/base.py new file mode 100644 index 0000000000000..9593a72b3d403 --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/base.py @@ -0,0 +1,19 @@ +""" +Base class for hardware breakpoints tests. +""" + +from lldbsuite.test.lldbtest import * + +class HardwareBreakpointTestBase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + + def supports_hw_breakpoints(self): + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), + CURRENT_EXECUTABLE_SET) + self.runCmd("breakpoint set -b main --hardware") + self.runCmd("run") + if 'stopped' in self.res.GetOutput(): + return 'Hardware breakpoints are supported' + return None diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py index bca7b278631fb..01bf33693a23b 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py @@ -9,50 +9,28 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -class HardwareBreakpointMultiThreadTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True +from functionalities.breakpoint.hardware_breakpoints.base import * +class HardwareBreakpointMultiThreadTestCase(HardwareBreakpointTestBase): mydir = TestBase.compute_mydir(__file__) - # LLDB on linux supports hardware breakpoints for arm and aarch64 - # architectures. - @skipUnlessPlatform(oslist=['linux']) - @skipIf(archs=no_match(['arm', 'aarch64'])) - def test_hw_break_set_delete_multi_thread_linux(self): - self.build() - self.setTearDownCleanup() - self.break_multi_thread('delete', False) # llvm.org/PR44659 - - # LLDB on linux supports hardware breakpoints for arm and aarch64 - # architectures. 
- @skipUnlessPlatform(oslist=['linux']) - @skipIf(archs=no_match(['arm', 'aarch64'])) - def test_hw_break_set_disable_multi_thread_linux(self): - self.build() - self.setTearDownCleanup() - self.break_multi_thread('disable', False) # llvm.org/PR44659 + def does_not_support_hw_breakpoints(self): + return not super().supports_hw_breakpoints() - # LLDB on darwin supports hardware breakpoints for x86_64 and i386 - # architectures. - @skipUnlessDarwin @skipIfOutOfTreeDebugserver - @expectedFailureAll(archs=["arm64"]) + @skipTestIfFn(does_not_support_hw_breakpoints) def test_hw_break_set_delete_multi_thread_macos(self): self.build() self.setTearDownCleanup() self.break_multi_thread('delete') - # LLDB on darwin supports hardware breakpoints for x86_64 and i386 - # architectures. - @skipUnlessDarwin @skipIfOutOfTreeDebugserver - @expectedFailureAll(archs=["arm64"]) + @skipTestIfFn(does_not_support_hw_breakpoints) def test_hw_break_set_disable_multi_thread_macos(self): self.build() self.setTearDownCleanup() self.break_multi_thread('disable') - def setUp(self): # Call super's setUp(). TestBase.setUp(self) @@ -62,7 +40,7 @@ def setUp(self): self.first_stop = line_number( self.source, 'Starting thread creation with hardware breakpoint set') - def break_multi_thread(self, removal_type, check_hw_bp=True): + def break_multi_thread(self, removal_type): """Test that lldb hardware breakpoints work for multiple threads.""" self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) @@ -106,10 +84,9 @@ def break_multi_thread(self, removal_type, check_hw_bp=True): # Continue the loop and test that we are stopped 4 times. count += 1 - if check_hw_bp: - # Check the breakpoint list. - self.expect("breakpoint list", substrs=['hw_break_function', 'hardware']) - self.expect("breakpoint list -v", substrs=['function = hw_break_function', 'hardware = true']) + # Check the breakpoint list. 
+ self.expect("breakpoint list", substrs=['hw_break_function', 'hardware']) + self.expect("breakpoint list -v", substrs=['function = hw_break_function', 'hardware = true']) if removal_type == 'delete': self.runCmd("settings set auto-confirm true") diff --git a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile similarity index 100% rename from lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/Makefile rename to lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile diff --git a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py similarity index 87% rename from lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py rename to lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py index 61e4171131013..dfb946036aa25 100644 --- a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/TestRequireHWBreakpoints.py @@ -8,20 +8,13 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil +from functionalities.breakpoint.hardware_breakpoints.base import * -class BreakpointLocationsTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True +class BreakpointLocationsTestCase(HardwareBreakpointTestBase): mydir = TestBase.compute_mydir(__file__) def supports_hw_breakpoints(self): - self.build() - self.runCmd("file " + self.getBuildArtifact("a.out"), - CURRENT_EXECUTABLE_SET) - self.runCmd("breakpoint set -b main --hardware") - self.runCmd("run") - if 'stopped' in self.res.GetOutput(): - return 'Hardware breakpoints are supported' - return None + return super().supports_hw_breakpoints() def test_breakpoint(self): """Test regular breakpoints when hardware breakpoints are required.""" @@ -34,7 +27,6 @@ def test_breakpoint(self): breakpoint = target.BreakpointCreateByLocation("main.c", 1) self.assertTrue(breakpoint.IsHardware()) - @skipIfWindows @expectedFailure(supports_hw_breakpoints) def test_step_range(self): """Test stepping when hardware breakpoints are required.""" @@ -56,7 +48,6 @@ def test_step_range(self): self.assertTrue("Could not create hardware breakpoint for thread plan" in error.GetCString()) - @skipIfWindows @expectedFailure(supports_hw_breakpoints) def test_step_out(self): """Test stepping out when hardware breakpoints are required.""" @@ -77,7 +68,6 @@ def test_step_out(self): self.assertTrue("Could not create hardware breakpoint for thread plan" in error.GetCString()) - @skipIfWindows @expectedFailure(supports_hw_breakpoints) def test_step_over(self): """Test stepping over when hardware breakpoints are required.""" @@ -96,7 +86,6 @@ def test_step_over(self): 'error: Could not create hardware breakpoint for thread plan.' 
- @skipIfWindows @expectedFailure(supports_hw_breakpoints) def test_step_until(self): """Test stepping until when hardware breakpoints are required.""" diff --git a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/main.c b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/main.c similarity index 100% rename from lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/main.c rename to lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/main.c diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/Makefile b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/Makefile similarity index 100% rename from lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/Makefile rename to lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/Makefile diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/TestWriteMemoryWithHWBreakpoint.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/TestWriteMemoryWithHWBreakpoint.py new file mode 100644 index 0000000000000..842519be6f897 --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/TestWriteMemoryWithHWBreakpoint.py @@ -0,0 +1,51 @@ +""" +Test that writing memory doesn't affect hardware breakpoints. +""" + + + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +from functionalities.breakpoint.hardware_breakpoints.base import * + +class WriteMemoryWithHWBreakpoint(HardwareBreakpointTestBase): + mydir = TestBase.compute_mydir(__file__) + + def does_not_support_hw_breakpoints(self): + return not super().supports_hw_breakpoints() + + @skipTestIfFn(does_not_support_hw_breakpoints) + def test_copy_memory_with_hw_break(self): + self.build() + exe = self.getBuildArtifact("a.out") + + # Create a target by the debugger. + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + # Run the program and stop at entry. + self.expect("process launch --stop-at-entry", + patterns=["Process .* launched: .*a.out"]) + + process = target.GetProcess() + self.assertTrue(process, PROCESS_IS_VALID) + + # Set a hardware breakpoint. + bp_id = lldbutil.run_break_set_by_symbol(self, "hw_break_function", + extra_options="--hardware") + + # Get breakpoint location from the breakpoint. + location = target.FindBreakpointByID(bp_id).GetLocationAtIndex(0) + self.assertTrue(location and location.IsResolved(), + VALID_BREAKPOINT_LOCATION) + + # Check that writing overlapping memory doesn't crash.
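+        # Unlike software breakpoints, which patch trap opcodes into the
+        # instruction stream, hardware breakpoints live in CPU debug
+        # registers, so a write that overlaps the breakpoint address should
+        # be passed through unmodified and succeed.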
+ address = location.GetLoadAddress() + data = str("\x01\x02\x03\x04") + error = lldb.SBError() + + result = process.WriteMemory(address, data, error) + self.assertTrue(error.Success() and result == len(data)) diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/main.cpp b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/main.cpp new file mode 100644 index 0000000000000..bad219082cafe --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/write_memory_with_hw_breakpoint/main.cpp @@ -0,0 +1,9 @@ +static volatile int num = 1; + +bool hw_break_function (int i) { + return num == i; +} + +int main (int argc, char const *argv[]) { + return hw_break_function(argc) ? 0 : 1; +} diff --git a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py index 9408ad6eee1d1..aa383d0005e41 100644 --- a/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py +++ b/lldb/test/API/functionalities/limit-debug-info/TestLimitDebugInfo.py @@ -38,7 +38,8 @@ def test_one_and_two_debug(self): self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # But when other shared libraries are loaded, we should be able to see # all members. @@ -58,6 +59,10 @@ def test_one_and_two_debug(self): self.expect_expr("array_of_two[2].one[2].member", result_value="174") self.expect_expr("array_of_two[2].member", result_value="274") + self.expect_expr("get_one().member", result_value="124") + self.expect_expr("get_two().one().member", result_value="124") + self.expect_expr("get_two().member", result_value="224") + @skipIf(bugnumber="pr46284", debug_info="gmodules") @skipIfWindows # Clang emits type info even with -flimit-debug-info def test_two_debug(self): @@ -66,7 +71,8 @@ self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # This time, we should only see the members from the second library. self.expect_expr("inherits_from_one.member", result_value="47") @@ -91,6 +97,12 @@ substrs=["no member named 'member' in 'array::One'"]) self.expect_expr("array_of_two[2].member", result_value="274") + self.expect("expr get_one().member", error=True, + substrs=["calling 'get_one' with incomplete return type 'result::One'"]) + self.expect("expr get_two().one().member", error=True, + substrs=["calling 'one' with incomplete return type 'result::One'"]) + self.expect_expr("get_two().member", result_value="224") + @skipIf(bugnumber="pr46284", debug_info="gmodules") @skipIfWindows # Clang emits type info even with -flimit-debug-info def test_one_debug(self): @@ -99,7 +111,8 @@ self._check_debug_info_is_limited(target) - self.registerSharedLibrariesWithTarget(target, ["one", "two"]) + lldbutil.run_to_name_breakpoint(self, "main", + extra_images=["one", "two"]) # In this case we should only see the members from the second library.
# Note that we cannot see inherits_from_two.one because without debug @@ -126,3 +139,9 @@ def test_one_debug(self): substrs=["no member named 'one' in 'array::Two'"]) self.expect("expr array_of_two[2].member", error=True, substrs=["no member named 'member' in 'array::Two'"]) + + self.expect_expr("get_one().member", result_value="124") + self.expect("expr get_two().one().member", error=True, + substrs=["calling 'get_two' with incomplete return type 'result::Two'"]) + self.expect("expr get_two().member", error=True, + substrs=["calling 'get_two' with incomplete return type 'result::Two'"]) diff --git a/lldb/test/API/functionalities/limit-debug-info/main.cpp b/lldb/test/API/functionalities/limit-debug-info/main.cpp index 0a25de13d9fb2..1aad7e6f1e610 100644 --- a/lldb/test/API/functionalities/limit-debug-info/main.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/main.cpp @@ -25,4 +25,7 @@ struct TwoAsMember { array::One array_of_one[3]; array::Two array_of_two[3]; -int main() { return 0; } +result::One get_one() { return result::One(124); } +result::Two get_two() { return result::Two(224); } + +int main() { return get_one().member; } diff --git a/lldb/test/API/functionalities/limit-debug-info/one.cpp b/lldb/test/API/functionalities/limit-debug-info/one.cpp index c1eb6310dd679..70353a084edc5 100644 --- a/lldb/test/API/functionalities/limit-debug-info/one.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/one.cpp @@ -3,3 +3,6 @@ One::~One() = default; member::One::~One() = default; array::One::~One() = default; + +result::One::One(int member) : member(member) {} +result::One::~One() = default; diff --git a/lldb/test/API/functionalities/limit-debug-info/onetwo.h b/lldb/test/API/functionalities/limit-debug-info/onetwo.h index 67609dd7ff61c..24a18f6a5dcc3 100644 --- a/lldb/test/API/functionalities/limit-debug-info/onetwo.h +++ b/lldb/test/API/functionalities/limit-debug-info/onetwo.h @@ -39,3 +39,18 @@ struct Two { virtual ~Two(); }; } // namespace array + +namespace result { +struct One { + int member; + One(int member); + virtual ~One(); +}; + +struct Two { + int member; + Two(int member); + One one() const; + virtual ~Two(); +}; +} // namespace result diff --git a/lldb/test/API/functionalities/limit-debug-info/two.cpp b/lldb/test/API/functionalities/limit-debug-info/two.cpp index 04683da6e9cce..468cb91f6a5c3 100644 --- a/lldb/test/API/functionalities/limit-debug-info/two.cpp +++ b/lldb/test/API/functionalities/limit-debug-info/two.cpp @@ -3,3 +3,7 @@ Two::~Two() = default; member::Two::~Two() = default; array::Two::~Two() = default; + +result::Two::Two(int member) : member(member) {} +result::Two::~Two() = default; +result::One result::Two::one() const { return One(member - 100); } diff --git a/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py b/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py index 6118854131024..4154bb144b350 100644 --- a/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py +++ b/lldb/test/API/lang/objc/bitfield_ivars/TestBitfieldIvars.py @@ -14,9 +14,8 @@ def test(self): lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.m")) self.expect_expr("chb->hb->field1", result_type="unsigned int", result_value="0") - - ## FIXME field2 should have a value of 1 - self.expect("expr chb->hb->field2", matching=False, substrs = ["= 1"]) # this must happen second + ## This should happen second + self.expect_expr("chb->hb->field2", result_type="unsigned int", result_value="1") self.expect_expr("hb2->field1", 
result_type="unsigned int", result_value="10") self.expect_expr("hb2->field2", result_type="unsigned int", result_value="3") diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index a211abe6cc41c..e083e2fd9bebb 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -6,6 +6,7 @@ import platform import shlex import shutil +import subprocess import lit.formats @@ -33,7 +34,6 @@ def mkdir_p(path): def find_sanitizer_runtime(name): - import subprocess resource_dir = subprocess.check_output( [config.cmake_cxx_compiler, '-print-resource-dir']).decode('utf-8').strip() @@ -60,7 +60,6 @@ def find_python_interpreter(): return copied_python # Find the "real" python binary. - import shutil, subprocess real_python = subprocess.check_output([ config.python_executable, os.path.join(os.path.dirname(os.path.realpath(__file__)), diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile index 929ed58f75757..9069302b39c4f 100644 --- a/lldb/test/API/macosx/indirect_symbol/Makefile +++ b/lldb/test/API/macosx/indirect_symbol/Makefile @@ -8,7 +8,8 @@ include Makefile.rules build-libindirect: indirect.c $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES + DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \ + LD_EXTRAS="-Wl,-image_base,0x200000000" build-libreepxoprt: reexport.c $(MAKE) -f $(MAKEFILE_RULES) \ diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py index 6e67fdc879bef..824cb9eee295c 100644 --- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py +++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py @@ -27,18 +27,28 @@ def check_debugserver(self, log, expected_platform, expected_version): """scan the debugserver packet log""" logfile = open(log, "r") dylib_info = None - response = False + process_info_ostype = None + expect_dylib_info_response = False + expect_process_info_response = False for line in logfile: - if response: + if expect_dylib_info_response: while line[0] != '$': line = line[1:] line = line[1:] # Unescape '}'. 
dylib_info = json.loads(line.replace('}]','}')[:-4]) - response = False + expect_dylib_info_response = False if 'send packet: $jGetLoadedDynamicLibrariesInfos:{' in line: - response = True - + expect_dylib_info_response = True + if expect_process_info_response: + for pair in line.split(';'): + keyval = pair.split(':') + if len(keyval) == 2 and keyval[0] == 'ostype': + process_info_ostype = keyval[1] + if 'send packet: $qProcessInfo#' in line: + expect_process_info_response = True + + self.assertEquals(process_info_ostype, expected_platform) self.assertTrue(dylib_info) aout_info = None for image in dylib_info['images']: diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/.categories b/lldb/test/API/tools/lldb-server/.categories similarity index 100% rename from lldb/packages/Python/lldbsuite/test/tools/lldb-server/.categories rename to lldb/test/API/tools/lldb-server/.categories diff --git a/lldb/test/API/tools/lldb-server/registers-target-xml-reading/Makefile b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py similarity index 96% rename from lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py rename to lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py index 530e2ce80023b..5ef074c1802cf 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py +++ b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/TestGdbRemoteTargetXmlPacket.py @@ -43,7 +43,7 @@ def test_g_target_xml_returns_correct_data(self): architecture = root.find("architecture") self.assertIsNotNone(architecture) - self.assertEqual(architecture.text, self.getArchitecture()) + self.assertIn(self.getArchitecture(), architecture.text) feature = root.find("feature") self.assertIsNotNone(feature) @@ -66,4 +66,4 @@ def test_g_target_xml_returns_correct_data(self): self.assertEqual(q_info_reg["format"], xml_info_reg.get("format")) self.assertEqual(q_info_reg["bitsize"], xml_info_reg.get("bitsize")) self.assertEqual(q_info_reg["offset"], xml_info_reg.get("offset")) - self.assertEqual(q_info_reg["encoding"], xml_info_reg.get("encoding")) \ No newline at end of file + self.assertEqual(q_info_reg["encoding"], xml_info_reg.get("encoding")) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/main.cpp b/lldb/test/API/tools/lldb-server/registers-target-xml-reading/main.cpp similarity index 100% rename from lldb/packages/Python/lldbsuite/test/tools/lldb-server/registers-target-xml-reading/main.cpp rename to lldb/test/API/tools/lldb-server/registers-target-xml-reading/main.cpp diff --git a/lldb/test/API/tools/lldb-vscode/module/Makefile b/lldb/test/API/tools/lldb-vscode/module/Makefile index 1fb944b138937..b30baf48b972e 100644 --- a/lldb/test/API/tools/lldb-vscode/module/Makefile +++ b/lldb/test/API/tools/lldb-vscode/module/Makefile @@ -2,12 +2,16 @@ DYLIB_NAME := foo DYLIB_CXX_SOURCES := foo.cpp CXX_SOURCES := main.cpp -all: a.out.stripped +LD_EXTRAS := 
-Wl,-rpath "-Wl,$(shell pwd)" +USE_LIBDL :=1 include Makefile.rules -a.out.stripped: a.out.dSYM +all: a.out.stripped + +a.out.stripped: strip -o a.out.stripped a.out + ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped -endif +endif \ No newline at end of file diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py index 40c4145b38e36..a16430fccae1d 100644 --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -10,56 +10,93 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbvscode_testcase +import re class TestVSCode_module(lldbvscode_testcase.VSCodeTestCaseBase): mydir = TestBase.compute_mydir(__file__) - - @skipIfWindows - @skipUnlessDarwin - @skipIfRemote - def test_modules_event(self): + def run_test(self, symbol_basename, expect_debug_info_size): program_basename = "a.out.stripped" - program= self.getBuildArtifact(program_basename) + program = self.getBuildArtifact(program_basename) self.build_and_launch(program) functions = ['foo'] breakpoint_ids = self.set_function_breakpoints(functions) - self.assertEquals(len(breakpoint_ids), len(functions), - 'expect one breakpoint') + self.assertEquals(len(breakpoint_ids), len(functions), 'expect one breakpoint') self.continue_to_breakpoints(breakpoint_ids) active_modules = self.vscode.get_active_modules() - self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) program_module = active_modules[program_basename] + self.assertIn(program_basename, active_modules, '%s module is in active modules' % (program_basename)) self.assertIn('name', program_module, 'make sure name is in module') self.assertEqual(program_basename, program_module['name']) self.assertIn('path', program_module, 'make sure path is in module') self.assertEqual(program, program_module['path']) self.assertTrue('symbolFilePath' not in program_module, 'Make sure a.out.stripped has no debug info') self.assertEqual('Symbols not found.', program_module['symbolStatus']) - symbol_path = self.getBuildArtifact("a.out") - self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbol_path))) + symbols_path = self.getBuildArtifact(symbol_basename) + self.vscode.request_evaluate('`%s' % ('target symbols add -s "%s" "%s"' % (program, symbols_path))) def checkSymbolsLoaded(): active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] return 'Symbols loaded.' == program_module['symbolStatus'] - self.waitUntil(checkSymbolsLoaded) + def checkSymbolsLoadedWithSize(): + active_modules = self.vscode.get_active_modules() + program_module = active_modules[program_basename] + symbolsStatus = program_module['symbolStatus'] + symbol_regex = re.compile(r"Symbols loaded. 
\([0-9]+(\.[0-9]*)?[KMG]?B\)") + return symbol_regex.match(symbolsStatus) + + if expect_debug_info_size: + self.waitUntil(checkSymbolsLoadedWithSize) + else: + self.waitUntil(checkSymbolsLoaded) active_modules = self.vscode.get_active_modules() program_module = active_modules[program_basename] self.assertEqual(program_basename, program_module['name']) self.assertEqual(program, program_module['path']) - self.assertEqual('Symbols loaded.', program_module['symbolStatus']) self.assertIn('symbolFilePath', program_module) - self.assertEqual(symbol_path, program_module['symbolFilePath']) + self.assertIn(symbols_path, program_module['symbolFilePath']) self.assertIn('addressRange', program_module) + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + # TODO: Update the Makefile so that this test runs on Linux. + def test_module_event(self): + ''' + Mac or Linux. + + On mac, if we load a.out as our symbol file, we will use DWARF with .o files and we will + have debug symbols, but we won't see any debug info size because all of the DWARF + sections are in .o files. + + On other platforms, we expect a.out to have debug info, so we will expect a size. + ''' + expect_debug_info_size = platform.system() != 'Darwin' + return self.run_test("a.out", expect_debug_info_size) + + @skipIfWindows + @skipUnlessDarwin + @skipIfRemote + def test_module_event_dsym(self): + ''' + Darwin only test with dSYM file. + + On mac, if we load a.out.dSYM as our symbol file, we will have debug symbols and we + will have DWARF sections added to the module, so we will expect a size. + ''' + return self.run_test("a.out.dSYM", True) + @skipIfWindows @skipUnlessDarwin @skipIfRemote def test_compile_units(self): - program= self.getBuildArtifact("a.out") + program = self.getBuildArtifact("a.out") self.build_and_launch(program) source = "main.cpp" main_source_path = self.getSourcePath(source) diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 96931cbaeaf99..dc5af5e2defe7 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -216,6 +216,9 @@ add_dependencies(check-lldb-reproducers check-lldb-reproducers-capture) if(LLDB_BUILT_STANDALONE) # This has to happen *AFTER* add_lit_testsuite. if (EXISTS ${LLVM_MAIN_SRC_DIR}/utils/llvm-lit) + # LLVM's make_paths_relative uses Python3_EXECUTABLE which isn't set in a + # standalone LLDB build. + set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE}) add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit ${CMAKE_CURRENT_BINARY_DIR}/llvm-lit) endif() endif() diff --git a/lldb/test/Shell/Driver/TestError.test b/lldb/test/Shell/Driver/TestError.test new file mode 100644 index 0000000000000..3d34a72b14aba --- /dev/null +++ b/lldb/test/Shell/Driver/TestError.test @@ -0,0 +1,2 @@ +RUN: not %lldb --arch 2>&1 | FileCheck %s +CHECK: error: argument to '--arch' is missing diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s new file mode 100644 index 0000000000000..7ed33ce50297b --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_declaration-with-children.s @@ -0,0 +1,160 @@ +# Test that a forward-declared (DW_AT_declaration) structure is treated as a +# forward-declaration even if it has children. These types can be produced due +# to vtable-based type homing, or other -flimit-debug-info optimizations.
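+#
+# A C++ shape that can produce this kind of DWARF (an illustrative sketch
+# only; the assembly below is hand-written rather than compiler output):
+#
+#   struct A {
+#     A();          // the declaration keeps a DW_TAG_subprogram child
+#     virtual ~A(); // key function, defined in another translation unit
+#   };
+#   A a;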
+ +# REQUIRES: x86 + +# RUN: llvm-mc --triple x86_64-pc-linux %s --filetype=obj > %t +# RUN: %lldb %t -o "expr a" -o exit 2>&1 | FileCheck %s --check-prefix=EXPR +# RUN: %lldb %t -o "target var a" -o exit 2>&1 | FileCheck %s --check-prefix=VAR + +# EXPR: incomplete type 'A' where a complete type is required + +# FIXME: This should also produce some kind of an error. +# VAR: (A) a = {} + + .text +_ZN1AC2Ev: + retq +.LZN1AC2Ev_end: + + .data +a: + .quad $_ZTV1A+16 + .quad $0xdeadbeef + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 52 # DW_AT_artificial + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 100 # DW_AT_object_pointer + .byte 19 # DW_FORM_ref4 + .byte 71 # DW_AT_specification + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 11 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 52 # DW_AT_artificial + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .quad _ZN1AC2Ev # DW_AT_low_pc + .long .LZN1AC2Ev_end-_ZN1AC2Ev # DW_AT_high_pc + .byte 2 # Abbrev [2] DW_TAG_variable + .asciz "a" # DW_AT_name + .long .LA-.Lcu_begin0 # DW_AT_type + .byte 9 # DW_AT_location + .byte 3 + .quad a +.LA: + .byte 3 # Abbrev [3] DW_TAG_structure_type + .asciz "A" # DW_AT_name + # DW_AT_declaration + .byte 4 # Abbrev [4] DW_TAG_subprogram + .asciz "A" # DW_AT_name + # DW_AT_declaration + .byte 5 # Abbrev [5] DW_TAG_formal_parameter + .long .LAptr-.Lcu_begin0 # DW_AT_type + # DW_AT_artificial + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.LAptr: + .byte 8 # Abbrev [8] DW_TAG_pointer_type + .long .LA-.Lcu_begin0 # DW_AT_type + .byte 10 # Abbrev [10] DW_TAG_subprogram + .quad _ZN1AC2Ev # DW_AT_low_pc + .long .LZN1AC2Ev_end-_ZN1AC2Ev # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long 147 # DW_AT_object_pointer + .long 68 # DW_AT_specification + .byte 11 # Abbrev [11] DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .asciz "this" # DW_AT_name + .long .LAptr-.Lcu_begin0 # DW_AT_type + # DW_AT_artificial + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end0: diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_TAG_GNU_call_site-DW_AT_low_pc.s b/lldb/test/Shell/SymbolFile/DWARF/DW_TAG_GNU_call_site-DW_AT_low_pc.s index 29cafebaa4405..8cc1cc1ed2f95 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/DW_TAG_GNU_call_site-DW_AT_low_pc.s +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_TAG_GNU_call_site-DW_AT_low_pc.s @@ -79,7 +79,7 @@ main: .uleb128 0x2 # (DIE (0x2d) DW_TAG_subprogram) # DW_AT_external .asciz "main" # DW_AT_name: "main" - .long .Ltype_int # DW_AT_type + .long .Ltype_int - .Ldebug_info0 # DW_AT_type .quad .LFB2 # DW_AT_low_pc .quad .LFE2-.LFB2 # DW_AT_high_pc .uleb128 0x1 # DW_AT_frame_base @@ -87,7 +87,7 @@ main: # DW_AT_GNU_all_call_sites .uleb128 0x3 # (DIE (0x4f) DW_TAG_GNU_call_site) .quad .LVL4 # DW_AT_low_pc - .long .Lfunc_a # DW_AT_abstract_origin + .long .Lfunc_a - .Ldebug_info0 # DW_AT_abstract_origin .uleb128 0x4 # (DIE (0x5c) DW_TAG_GNU_call_site_parameter) .uleb128 0x1 # DW_AT_location .byte 0x55 # DW_OP_reg5 @@ -111,7 +111,7 @@ main: # DW_AT_GNU_all_call_sites .uleb128 0x7 # (DIE (0x86) DW_TAG_formal_parameter) .asciz "p" # DW_AT_name - .long .Ltype_int # DW_AT_type + .long .Ltype_int - .Ldebug_info0 # DW_AT_type .long .LLST0 # DW_AT_location .byte 0 # end of children of DIE 0x6a .byte 0 # end of children of DIE 0xb diff --git a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp index 503939680c500..3c4b005cdf1be 100644 --- a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp +++ b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp @@ -106,7 +106,6 @@ class Class : public Base { // Test base class. 
int main() { MemberTest::Base B1; B1.Get(); - MemberTest::Class C1; MemberTest::Class::StaticMemberFunc(1, 10, 2); return 0; } diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index af13a8f8208ba..3c1cd85dc310f 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -442,6 +442,39 @@ nub_process_t DNBProcessAttach(nub_process_t attach_pid, if (err_str && err_len > 0) err_str[0] = '\0'; + if (getenv("LLDB_DEBUGSERVER_PATH") == NULL) { + int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, + static_cast<int>(attach_pid)}; + struct kinfo_proc processInfo; + size_t bufsize = sizeof(processInfo); + if (sysctl(mib, (unsigned)(sizeof(mib) / sizeof(int)), &processInfo, + &bufsize, NULL, 0) == 0 && + bufsize > 0) { + + if ((processInfo.kp_proc.p_flag & P_TRANSLATED) == P_TRANSLATED) { + const char *translated_debugserver = + "/Library/Apple/usr/libexec/oah/debugserver"; + char fdstr[16]; + char pidstr[16]; + extern int communication_fd; + + if (communication_fd == -1) { + fprintf(stderr, "Trying to attach to a translated process with the " + "native debugserver, exiting...\n"); + exit(1); + } + + snprintf(fdstr, sizeof(fdstr), "--fd=%d", communication_fd); + snprintf(pidstr, sizeof(pidstr), "--attach=%d", attach_pid); + execl(translated_debugserver, translated_debugserver, "--native-regs", + "--setsid", fdstr, "--handoff-attach-from-native", pidstr, (char *)0); + DNBLogThreadedIf(LOG_PROCESS, "Failed to launch debugserver for " + "translated process: %d (%s)", errno, strerror(errno)); + __builtin_trap(); + } + } + } + pid_t pid = INVALID_NUB_PROCESS; MachProcessSP processSP(new MachProcess); if (processSP.get()) { @@ -1393,7 +1426,10 @@ const char *DNBGetDeploymentInfo(nub_process_t pid, uint32_t& patch_version) { MachProcessSP procSP; if (GetProcessSP(pid, procSP)) { - // FIXME: This doesn't correct for older ios simulator and macCatalyst. + // FIXME: This doesn't return the correct result when xctest (a + // macOS binary) is loaded with the macCatalyst dyld platform + // override. The image info corrects for this, but qProcessInfo + // will return what is in the binary. auto info = procSP->GetDeploymentInfo(lc, load_command_address); major_version = info.major_version; minor_version = info.minor_version; diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index c749dd8426c5d..9d712390ac2ac 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -236,9 +236,6 @@ class MachProcess { operator bool() { return platform > 0; } /// The Mach-O platform type; unsigned char platform = 0; - /// Pre-LC_BUILD_VERSION files don't disambiguate between ios and ios - /// simulator. - bool maybe_simulator = false; uint32_t major_version = 0; uint32_t minor_version = 0; uint32_t patch_version = 0; diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 8a35f605daa38..10eaf38ea4357 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -617,7 +617,28 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, info.major_version = vers_cmd.version >> 16; info.minor_version = (vers_cmd.version >> 8) & 0xffu; info.patch_version = vers_cmd.version & 0xffu; - info.maybe_simulator = true; + + // Disambiguate legacy simulator platforms.
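+  // A binary that only carries a legacy LC_VERSION_MIN_* load command cannot
+  // say whether it targets a device or the simulator, so the host
+  // architecture checked below decides which interpretation must be correct.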
+#if (defined(__x86_64__) || defined(__i386__)) + // If we are running on Intel macOS, it is safe to assume this is + // really a back-deploying simulator binary. + switch (info.platform) { + case PLATFORM_IOS: + info.platform = PLATFORM_IOSSIMULATOR; + break; + case PLATFORM_TVOS: + info.platform = PLATFORM_TVOSSIMULATOR; + break; + case PLATFORM_WATCHOS: + info.platform = PLATFORM_WATCHOSSIMULATOR; + break; + } +#else + // On an Apple Silicon macOS host, there is no ambiguity. The only + // binaries that use legacy load commands are back-deploying + // native iOS binaries. All simulator binaries use the newer, + // unambiguous LC_BUILD_VERSION load commands. +#endif }; switch (cmd) { case LC_VERSION_MIN_IPHONEOS: @@ -778,34 +799,6 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, uuid_copy(inf.uuid, uuidcmd.uuid); } if (DeploymentInfo deployment_info = GetDeploymentInfo(lc, load_cmds_p)) { - // Simulator support. If the platform is ambiguous, use the dyld info. - if (deployment_info.maybe_simulator) { - if (deployment_info.maybe_simulator) { -#if (defined(__x86_64__) || defined(__i386__)) - // If dyld doesn't return a platform, use a heuristic. - // If we are running on Intel macOS, it is safe to assume - // this is really a back-deploying simulator binary. - switch (deployment_info.platform) { - case PLATFORM_IOS: - deployment_info.platform = PLATFORM_IOSSIMULATOR; - break; - case PLATFORM_TVOS: - deployment_info.platform = PLATFORM_TVOSSIMULATOR; - break; - case PLATFORM_WATCHOS: - deployment_info.platform = PLATFORM_WATCHOSSIMULATOR; - break; - } -#else - // On an Apple Silicon macOS host, there is no - // ambiguity. The only binaries that use legacy load - // commands are back-deploying native iOS binaries. All - // simulator binaries use the newer, unambiguous - // LC_BUILD_VERSION load commands. - deployment_info.maybe_simulator = false; -#endif - } - } const char *lc_platform = GetPlatformString(deployment_info.platform); // macCatalyst support. // diff --git a/lldb/tools/debugserver/source/debugserver.cpp b/lldb/tools/debugserver/source/debugserver.cpp index 42205dedf4bb8..04cbd2c8b503e 100644 --- a/lldb/tools/debugserver/source/debugserver.cpp +++ b/lldb/tools/debugserver/source/debugserver.cpp @@ -156,6 +156,20 @@ RNBRunLoopMode RNBRunLoopGetStartModeFromRemote(RNBRemote *remote) { return eRNBRunLoopModeExit; } +// Check the name to see if it ends with .app +static bool is_dot_app (const char *app_name) { + size_t len = strlen(app_name); + if (len < 4) + return false; + + if (app_name[len - 4] == '.' && + app_name[len - 3] == 'a' && + app_name[len - 2] == 'p' && + app_name[len - 1] == 'p') + return true; + return false; +} + // This run loop mode will wait for the process to launch and hit its // entry point. It will currently ignore all events except for the // process state changed event, where it watches for the process stopped @@ -198,19 +212,21 @@ RNBRunLoopMode RNBRunLoopLaunchInferior(RNBRemote *remote, // Our default launch method is posix spawn launch_flavor = eLaunchFlavorPosixSpawn; + const bool dot_app = is_dot_app(inferior_argv[0]); + (void)dot_app; #if defined WITH_FBS // Check if we have an app bundle, if so launch using BackBoard Services. - if (strstr(inferior_argv[0], ".app")) { + if (dot_app) { launch_flavor = eLaunchFlavorFBS; } #elif defined WITH_BKS // Check if we have an app bundle, if so launch using BackBoard Services. 
- if (strstr(inferior_argv[0], ".app")) { + if (dot_app) { launch_flavor = eLaunchFlavorBKS; } #elif defined WITH_SPRINGBOARD // Check if we have an app bundle, if so launch using SpringBoard. - if (strstr(inferior_argv[0], ".app")) { + if (dot_app) { launch_flavor = eLaunchFlavorSpringBoard; } #endif @@ -878,6 +894,8 @@ static struct option g_long_options[] = { // -F localhost:1234 -- /bin/ls" {NULL, 0, NULL, 0}}; +int communication_fd = -1; + // main int main(int argc, char *argv[]) { // If debugserver is launched with DYLD_INSERT_LIBRARIES, unset it so we @@ -944,7 +962,6 @@ int main(int argc, char *argv[]) { int ch; int long_option_index = 0; int debug = 0; - int communication_fd = -1; std::string compile_options; std::string waitfor_pid_name; // Wait for a process that starts with this name std::string attach_pid_name; @@ -1498,17 +1515,17 @@ #if defined WITH_FBS // Check if we have an app bundle, if so launch using BackBoard Services. - if (waitfor_pid_name.find(".app") != std::string::npos) { + if (is_dot_app(waitfor_pid_name.c_str())) { launch_flavor = eLaunchFlavorFBS; } #elif defined WITH_BKS // Check if we have an app bundle, if so launch using BackBoard Services. - if (waitfor_pid_name.find(".app") != std::string::npos) { + if (is_dot_app(waitfor_pid_name.c_str())) { launch_flavor = eLaunchFlavorBKS; } #elif defined WITH_SPRINGBOARD // Check if we have an app bundle, if so launch using SpringBoard. - if (waitfor_pid_name.find(".app") != std::string::npos) { + if (is_dot_app(waitfor_pid_name.c_str())) { launch_flavor = eLaunchFlavorSpringBoard; } #endif diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index cea9e5a44aa80..0cd1ffc57aa7a 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -853,10 +853,11 @@ int main(int argc, char const *argv[]) { // Parse arguments. LLDBOptTable T; - unsigned MAI; - unsigned MAC; + unsigned MissingArgIndex; + unsigned MissingArgCount; ArrayRef<const char *> arg_arr = makeArrayRef(argv + 1, argc - 1); - opt::InputArgList input_args = T.ParseArgs(arg_arr, MAI, MAC); + opt::InputArgList input_args = + T.ParseArgs(arg_arr, MissingArgIndex, MissingArgCount); llvm::StringRef argv0 = llvm::sys::path::filename(argv[0]); if (input_args.hasArg(OPT_help)) { @@ -864,11 +865,19 @@ return 0; } + // Check for missing argument error. + if (MissingArgCount) { + WithColor::error() << "argument to '" + << input_args.getArgString(MissingArgIndex) + << "' is missing\n"; + } // Error out on unknown options.
if (input_args.hasArg(OPT_UNKNOWN)) { for (auto *arg : input_args.filtered(OPT_UNKNOWN)) { WithColor::error() << "unknown option: " << arg->getSpelling() << '\n'; } + } + if (MissingArgCount || input_args.hasArg(OPT_UNKNOWN)) { llvm::errs() << "Use '" << argv0 << " --help' for a complete list of options.\n"; return 1; diff --git a/lldb/tools/lldb-vscode/JSONUtils.cpp b/lldb/tools/lldb-vscode/JSONUtils.cpp index 1ebaa5c377121..f6cdcf5a46cfc 100644 --- a/lldb/tools/lldb-vscode/JSONUtils.cpp +++ b/lldb/tools/lldb-vscode/JSONUtils.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include <algorithm> +#include <iomanip> +#include <sstream> #include "llvm/ADT/Optional.h" #include "llvm/Support/FormatAdapters.h" @@ -327,6 +329,50 @@ llvm::json::Value CreateBreakpoint(lldb::SBBreakpoint &bp, return llvm::json::Value(std::move(object)); } +static uint64_t GetDebugInfoSizeInSection(lldb::SBSection section) { + uint64_t debug_info_size = 0; + llvm::StringRef section_name(section.GetName()); + if (section_name.startswith(".debug") || section_name.startswith("__debug") || + section_name.startswith(".apple") || section_name.startswith("__apple")) + debug_info_size += section.GetFileByteSize(); + size_t num_sub_sections = section.GetNumSubSections(); + for (size_t i = 0; i < num_sub_sections; i++) { + debug_info_size += + GetDebugInfoSizeInSection(section.GetSubSectionAtIndex(i)); + } + return debug_info_size; +} + +static uint64_t GetDebugInfoSize(lldb::SBModule module) { + uint64_t debug_info_size = 0; + size_t num_sections = module.GetNumSections(); + for (size_t i = 0; i < num_sections; i++) { + debug_info_size += GetDebugInfoSizeInSection(module.GetSectionAtIndex(i)); + } + return debug_info_size; +} + +static std::string ConvertDebugInfoSizeToString(uint64_t debug_info) { + std::ostringstream oss; + oss << " ("; + oss << std::fixed << std::setprecision(1); + + if (debug_info < 1024) { + oss << debug_info << "B"; + } else if (debug_info < 1024 * 1024) { + double kb = double(debug_info) / 1024.0; + oss << kb << "KB"; + } else if (debug_info < 1024 * 1024 * 1024) { + double mb = double(debug_info) / (1024.0 * 1024.0); + oss << mb << "MB"; + } else { + double gb = double(debug_info) / (1024.0 * 1024.0 * 1024.0); + oss << gb << "GB"; + } + oss << ")"; + return oss.str(); +} llvm::json::Value CreateModule(lldb::SBModule &module) { llvm::json::Object object; if (!module.IsValid()) @@ -339,9 +385,15 @@ llvm::json::Value CreateModule(lldb::SBModule &module) { std::string module_path(module_path_arr); object.try_emplace("path", module_path); if (module.GetNumCompileUnits() > 0) { - object.try_emplace("symbolStatus", "Symbols loaded."); + std::string symbol_str = "Symbols loaded."; + uint64_t debug_info = GetDebugInfoSize(module); + if (debug_info > 0) { + symbol_str += ConvertDebugInfoSizeToString(debug_info); + } + object.try_emplace("symbolStatus", symbol_str); char symbol_path_arr[PATH_MAX]; - module.GetSymbolFileSpec().GetPath(symbol_path_arr, sizeof(symbol_path_arr)); + module.GetSymbolFileSpec().GetPath(symbol_path_arr, + sizeof(symbol_path_arr)); std::string symbol_path(symbol_path_arr); object.try_emplace("symbolFilePath", symbol_path); } else { @@ -352,8 +404,9 @@ object.try_emplace("addressRange", loaded_addr); std::string version_str; uint32_t version_nums[3]; - uint32_t num_versions = module.GetVersion(version_nums, sizeof(version_nums)/sizeof(uint32_t)); - for (uint32_t i=0; i + subsystems; +}; + +#ifdef __APPLE__ + 
+static void testSimPlatformArchHasSimEnvironment(llvm::StringRef name) { + Status error; + auto platform_sp = Platform::Create(ConstString(name), error); + ASSERT_TRUE(platform_sp); + int num_arches = 0; + + while (true) { + ArchSpec arch; + if (!platform_sp->GetSupportedArchitectureAtIndex(num_arches, arch)) + break; + EXPECT_EQ(arch.GetTriple().getEnvironment(), llvm::Triple::Simulator); + num_arches++; + } + + EXPECT_GT(num_arches, 0); +} + +TEST_F(PlatformAppleSimulatorTest, TestSimHasSimEnvironment) { + testSimPlatformArchHasSimEnvironment("ios-simulator"); + testSimPlatformArchHasSimEnvironment("tvos-simulator"); + testSimPlatformArchHasSimEnvironment("watchos-simulator"); +} + +TEST_F(PlatformAppleSimulatorTest, TestHostPlatformToSim) { + static const ArchSpec platform_arch( + HostInfo::GetArchitecture(HostInfo::eArchKindDefault)); + + const llvm::Triple::OSType sim_platforms[] = { + llvm::Triple::IOS, + llvm::Triple::TvOS, + llvm::Triple::WatchOS, + }; + + for (auto sim : sim_platforms) { + ArchSpec arch = platform_arch; + arch.GetTriple().setOS(sim); + arch.GetTriple().setEnvironment(llvm::Triple::Simulator); + + Status error; + auto platform_sp = Platform::Create(arch, nullptr, error); + EXPECT_TRUE(platform_sp); + } +} + +#endif diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index 4e8e2f3c34d71..ad0a8ac18cd19 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -306,6 +306,13 @@ TEST(ArchSpecTest, Compatibility) { ASSERT_FALSE(A.IsExactMatch(B)); ASSERT_FALSE(A.IsCompatibleMatch(B)); } + { + ArchSpec A("arm64-apple-ios"); + ArchSpec B("arm64-apple-ios-simulator"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_FALSE(A.IsCompatibleMatch(B)); + ASSERT_FALSE(B.IsCompatibleMatch(A)); + } { ArchSpec A("arm64-*-*"); ArchSpec B("arm64-apple-ios"); @@ -328,6 +335,40 @@ TEST(ArchSpecTest, Compatibility) { ASSERT_TRUE(A.IsExactMatch(B)); ASSERT_TRUE(A.IsCompatibleMatch(B)); } + { + ArchSpec A("x86_64"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + // FIXME: The exact match also looks unintuitive. + ASSERT_TRUE(A.IsExactMatch(B)); + ASSERT_TRUE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-ios12.0.0"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_FALSE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ASSERT_FALSE(A.IsExactMatch(B)); + ASSERT_TRUE(A.IsCompatibleMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + // ios-macabi wins. + A.MergeFrom(B); + ASSERT_TRUE(A.IsExactMatch(B)); + } + { + ArchSpec A("x86_64-apple-macosx10.14.2"); + ArchSpec B("x86_64-apple-ios12.0.0-macabi"); + ArchSpec C(B); + // ios-macabi wins.
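+  // MergeFrom only fills in the unspecified or less specific pieces of a
+  // triple, so whichever side merges, the more specific ios-macabi OS and
+  // environment are expected to survive and yield an exact match.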
+ B.MergeFrom(A); + ASSERT_TRUE(B.IsExactMatch(C)); + } } TEST(ArchSpecTest, OperatorBool) { diff --git a/lldb/unittests/Utility/RangeMapTest.cpp b/lldb/unittests/Utility/RangeMapTest.cpp index 8a243b6562181..97432dca983d3 100644 --- a/lldb/unittests/Utility/RangeMapTest.cpp +++ b/lldb/unittests/Utility/RangeMapTest.cpp @@ -12,6 +12,32 @@ using namespace lldb_private; +TEST(RangeVector, CombineConsecutiveRanges) { + using RangeVector = RangeVector; + using Entry = RangeVector::Entry; + + RangeVector V; + V.Append(0, 1); + V.Append(5, 1); + V.Append(6, 1); + V.Append(10, 9); + V.Append(15, 1); + V.Append(20, 9); + V.Append(21, 9); + V.Sort(); + V.CombineConsecutiveRanges(); + EXPECT_THAT(V, testing::ElementsAre(Entry(0, 1), Entry(5, 2), Entry(10, 9), + Entry(20, 10))); + + V.Clear(); + V.Append(0, 20); + V.Append(5, 1); + V.Append(10, 1); + V.Sort(); + V.CombineConsecutiveRanges(); + EXPECT_THAT(V, testing::ElementsAre(Entry(0, 20))); +} + using RangeDataVectorT = RangeDataVector; using EntryT = RangeDataVectorT::Entry; diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index f6bc6a404c159..70ce0a81627d2 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -92,6 +92,7 @@ TEST(ScalarTest, Getters) { CheckConversion(0x8765432112345678ull); CheckConversion(42.25f); CheckConversion(42.25); + CheckConversion(42.25L); EXPECT_EQ(APInt(128, 1) << 70, Scalar(std::pow(2.0f, 70.0f)).SInt128(APInt())); EXPECT_EQ(APInt(128, -1, true) << 70, diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index e998fa6bb257d..461b95bdcd8b5 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -537,3 +537,7 @@ D: PowerPC Backend Developer N: Djordje Todorovic E: djordje.todorovic@rt-rk.com D: Debug Information + +N: Biplob Mishra +E: biplmish@in.ibm.com +D: PowerPC Analysis diff --git a/llvm/bindings/go/llvm/InstrumentationBindings.cpp b/llvm/bindings/go/llvm/InstrumentationBindings.cpp index 794657fbce381..71d43de254a37 100644 --- a/llvm/bindings/go/llvm/InstrumentationBindings.cpp +++ b/llvm/bindings/go/llvm/InstrumentationBindings.cpp @@ -44,5 +44,5 @@ void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM, for (int i = 0; i != ABIListFilesNum; ++i) { ABIListFilesVec.push_back(ABIListFiles[i]); } - unwrap(PM)->add(createDataFlowSanitizerPass(ABIListFilesVec)); + unwrap(PM)->add(createDataFlowSanitizerLegacyPassPass(ABIListFilesVec)); } diff --git a/llvm/bindings/go/llvm/transforms_ipo.go b/llvm/bindings/go/llvm/transforms_ipo.go index 1dcb2af8bf243..8a158f208bef4 100644 --- a/llvm/bindings/go/llvm/transforms_ipo.go +++ b/llvm/bindings/go/llvm/transforms_ipo.go @@ -32,7 +32,6 @@ func (pm PassManager) AddFunctionAttrsPass() { C.LLVMAddFunctionAttrsPas func (pm PassManager) AddFunctionInliningPass() { C.LLVMAddFunctionInliningPass(pm.C) } func (pm PassManager) AddGlobalDCEPass() { C.LLVMAddGlobalDCEPass(pm.C) } func (pm PassManager) AddGlobalOptimizerPass() { C.LLVMAddGlobalOptimizerPass(pm.C) } -func (pm PassManager) AddIPConstantPropagationPass() { C.LLVMAddIPConstantPropagationPass(pm.C) } func (pm PassManager) AddPruneEHPass() { C.LLVMAddPruneEHPass(pm.C) } func (pm PassManager) AddIPSCCPPass() { C.LLVMAddIPSCCPPass(pm.C) } func (pm PassManager) AddInternalizePass(allButMain bool) { diff --git a/llvm/bindings/ocaml/transforms/ipo/ipo_ocaml.c b/llvm/bindings/ocaml/transforms/ipo/ipo_ocaml.c index 9fcaa10534f6b..c0e213714ed02 100644 --- a/llvm/bindings/ocaml/transforms/ipo/ipo_ocaml.c +++ 
b/llvm/bindings/ocaml/transforms/ipo/ipo_ocaml.c @@ -73,12 +73,6 @@ CAMLprim value llvm_add_global_optimizer(LLVMPassManagerRef PM) { return Val_unit; } -/* [`Module] Llvm.PassManager.t -> unit */ -CAMLprim value llvm_add_ip_constant_propagation(LLVMPassManagerRef PM) { - LLVMAddIPConstantPropagationPass(PM); - return Val_unit; -} - /* [`Module] Llvm.PassManager.t -> unit */ CAMLprim value llvm_add_prune_eh(LLVMPassManagerRef PM) { LLVMAddPruneEHPass(PM); diff --git a/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.ml b/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.ml index 1fb5594fcc7d6..d5d959288d332 100644 --- a/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.ml +++ b/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.ml @@ -33,9 +33,6 @@ external add_global_dce external add_global_optimizer : [ `Module ] Llvm.PassManager.t -> unit = "llvm_add_global_optimizer" -external add_ipc_propagation - : [ `Module ] Llvm.PassManager.t -> unit - = "llvm_add_ip_constant_propagation" external add_prune_eh : [ `Module ] Llvm.PassManager.t -> unit = "llvm_add_prune_eh" diff --git a/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.mli b/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.mli index 6507c5d92c2b1..a581924c6d549 100644 --- a/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.mli +++ b/llvm/bindings/ocaml/transforms/ipo/llvm_ipo.mli @@ -56,11 +56,6 @@ external add_global_optimizer : [ `Module ] Llvm.PassManager.t -> unit = "llvm_add_global_optimizer" -(** See the [llvm::createIPConstantPropagationPass] function. *) -external add_ipc_propagation - : [ `Module ] Llvm.PassManager.t -> unit - = "llvm_add_ip_constant_propagation" - (** See the [llvm::createPruneEHPass] function. *) external add_prune_eh : [ `Module ] Llvm.PassManager.t -> unit diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 333167bfb6b0d..2ff5c29c6b1a5 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -222,8 +222,13 @@ function(add_link_opts target_name) # Pass -O3 to the linker. This enabled different optimizations on different # linkers. if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin|SunOS|AIX" OR WIN32)) - set_property(TARGET ${target_name} APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,-O3") + # Before binutils 2.34, gold -O2 and above did not correctly handle R_386_GOTOFF to + # SHF_MERGE|SHF_STRINGS sections: https://sourceware.org/bugzilla/show_bug.cgi?id=16794 + if(LLVM_LINKER_IS_GOLD) + set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,-O1") + else() + set_property(TARGET ${target_name} APPEND_STRING PROPERTY LINK_FLAGS " -Wl,-O3") + endif() endif() if(LLVM_LINKER_IS_GOLD) @@ -1400,11 +1405,8 @@ function(add_unittest test_suite test_name) set(EXCLUDE_FROM_ALL ON) endif() - # Our current version of gtest does not properly recognize C++11 support - # with MSVC, so it falls back to tr1 / experimental classes. Since LLVM - # itself requires C++11, we can safely force it on unconditionally so that - # we don't have to fight with the buggy gtest check. - add_definitions(-DGTEST_LANG_CXX11=1) + # Our current version of gtest uses tr1/tuple which is deprecated on MSVC. + # Since LLVM itself requires C++14, we can safely force it off. 
add_definitions(-DGTEST_HAS_TR1_TUPLE=0) include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include) diff --git a/llvm/cmake/modules/FindGRPC.cmake b/llvm/cmake/modules/FindGRPC.cmake index 8a0ca593b2f4a..70d67ceb791d3 100644 --- a/llvm/cmake/modules/FindGRPC.cmake +++ b/llvm/cmake/modules/FindGRPC.cmake @@ -45,11 +45,25 @@ else() # system path. if (GRPC_HOMEBREW_RETURN_CODE EQUAL "0") include_directories(${GRPC_HOMEBREW_PATH}/include) - link_directories(${GRPC_HOMEBREW_PATH}/lib) + find_library(GRPC_LIBRARY + grpc++ + PATHS ${GRPC_HOMEBREW_PATH}/lib + NO_DEFAULT_PATH + REQUIRED) + add_library(grpc++ UNKNOWN IMPORTED GLOBAL) + set_target_properties(grpc++ PROPERTIES + IMPORTED_LOCATION ${GRPC_LIBRARY}) endif() if (PROTOBUF_HOMEBREW_RETURN_CODE EQUAL "0") include_directories(${PROTOBUF_HOMEBREW_PATH}/include) - link_directories(${PROTOBUF_HOMEBREW_PATH}/lib) + find_library(PROTOBUF_LIBRARY + protobuf + PATHS ${PROTOBUF_HOMEBREW_PATH}/lib + NO_DEFAULT_PATH + REQUIRED) + add_library(protobuf UNKNOWN IMPORTED GLOBAL) + set_target_properties(protobuf PROPERTIES + IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) endif() endif() endif() diff --git a/llvm/cmake/modules/FindLibpfm.cmake b/llvm/cmake/modules/FindLibpfm.cmake index 202bb030e3803..38cf3af3953d9 100644 --- a/llvm/cmake/modules/FindLibpfm.cmake +++ b/llvm/cmake/modules/FindLibpfm.cmake @@ -7,6 +7,7 @@ include(CheckIncludeFile) include(CheckLibraryExists) +include(CheckCXXSourceCompiles) if (LLVM_ENABLE_LIBPFM) check_library_exists(pfm pfm_initialize "" HAVE_LIBPFM_INITIALIZE) @@ -16,6 +17,19 @@ if (LLVM_ENABLE_LIBPFM) check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H) if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H) set(HAVE_LIBPFM 1) + # Check to see if perf_branch_entry has the field 'cycles'. + # We could not use CheckStructHasMember here because 'cycles' is a bit field, which is + # not supported by CheckStructHasMember. + CHECK_CXX_SOURCE_COMPILES(" + #include <perfmon/perf_event.h> + int main() { + perf_branch_entry entry; + entry.cycles = 2; + return 0; + }" COMPILE_WITH_CYCLES) + if(COMPILE_WITH_CYCLES) + set(LIBPFM_HAS_FIELD_CYCLES 1) + endif() endif() endif() endif() diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 62dd0ef79cf48..4feb4b7a7f300 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -261,7 +261,12 @@ if( LLVM_ENABLE_LLD ) if ( LLVM_USE_LINKER ) message(FATAL_ERROR "LLVM_ENABLE_LLD and LLVM_USE_LINKER can't be set at the same time") endif() - set(LLVM_USE_LINKER "lld") + # In the case of MSVC, cmake always invokes the linker directly, so the linker + # should be specified by the CMAKE_LINKER cmake variable instead of by the + # -fuse-ld compiler option. + if ( NOT MSVC ) + set(LLVM_USE_LINKER "lld") + endif() endif() if( LLVM_USE_LINKER ) @@ -416,6 +421,12 @@ if( MSVC ) append("/Zc:inline" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + # Some projects use the __cplusplus preprocessor macro to check support for + # a particular version of the C++ standard. When this option is not specified + # explicitly, the macro's value is "199711L", which implies the C++98 standard. + # https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ + append("/Zc:__cplusplus" CMAKE_CXX_FLAGS) + # Allow users to request PDBs in release mode. CMake offers the # RelWithDebInfo configuration, but it uses different optimization settings # (/Ob1 vs /Ob2 or -O2 vs -O3).
LLVM provides this flag so that users can get @@ -468,6 +479,10 @@ if( MSVC ) endif() endif() endif() + # By default MSVC has a 2^16 limit on the number of sections in an object file, + # but many object files need more than that. This flag raises the limit on the + # number of sections. + append("/bigobj" CMAKE_CXX_FLAGS) endif( MSVC ) # Warnings-as-errors handling for GCC-compatible compilers: diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index e729a839f614d..a5c370bbc25e4 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -7,6 +7,7 @@ set(LLVM_VERSION_MINOR @LLVM_VERSION_MINOR@) set(LLVM_VERSION_PATCH @LLVM_VERSION_PATCH@) set(LLVM_VERSION_SUFFIX @LLVM_VERSION_SUFFIX@) set(LLVM_PACKAGE_VERSION @PACKAGE_VERSION@) +set(LLVM_PACKAGE_BUGREPORT @PACKAGE_BUGREPORT@) set(LLVM_BUILD_TYPE @CMAKE_BUILD_TYPE@) @@ -50,6 +51,9 @@ set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@) set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@) set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@) +if(LLVM_ENABLE_ZLIB) + find_package(ZLIB) +endif() set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@) diff --git a/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst similarity index 73% rename from llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst rename to llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst index a2c48f97717a4..b6df0c92201ae 100644 --- a/llvm/docs/AMDGPUDwarfProposalForHeterogeneousDebugging.rst +++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst @@ -1,15 +1,15 @@ -.. _amdgpu-dwarf-proposal-for-heterogeneous-debugging: +.. _amdgpu-dwarf-extensions-for-heterogeneous-debugging: -****************************************** -DWARF Proposal For Heterogeneous Debugging -****************************************** +******************************************** +DWARF Extensions For Heterogeneous Debugging +******************************************** .. contents:: :local: .. warning:: - This document describes a **provisional proposal** for DWARF Version 6 + This document describes **provisional extensions** to DWARF Version 5 [:ref:`DWARF `] to support heterogeneous debugging. It is not currently fully implemented and is subject to change. @@ -37,43 +37,37 @@ and the Perforce TotalView HPC debugger [:ref:`Perforce-TotalView `]. To support debugging heterogeneous programs several features that are not provided by current DWARF Version 5 [:ref:`DWARF `] have -been identified. This document contains a collection of proposals to address +been identified. This document contains a collection of extensions to address providing those features. The :ref:`amdgpu-dwarf-motivation` section describes the issues that are being addressed for heterogeneous computing. That is followed by the -:ref:`amdgpu-dwarf-proposed-changes-relative-to-dwarf-version-5` section -containing the proposed textual changes relative to the DWARF Version 5 -standard. Then there is an :ref:`amdgpu-dwarf-examples` section that links to -the AMD GPU specific usage of the features in the proposal that includes an -example. Finally, there is a :ref:`amdgpu-dwarf-references` section. There are a -number of notes included that raise open questions, or provide alternative -approaches considered. The draft proposal seeks to be general in nature and -backwards compatible with DWARF Version 5.
Its goal is to be applicable to -meeting the needs of any heterogeneous system and not be vendor or architecture -specific. - -A fundamental aspect of the draft proposal is that it allows DWARF expression -location descriptions as stack elements. The draft proposal is based on DWARF +:ref:`amdgpu-dwarf-changes-relative-to-dwarf-version-5` section containing the +textual changes for the extensions relative to the DWARF Version 5 standard. +Then there is an :ref:`amdgpu-dwarf-examples` section that links to the AMD GPU +specific usage of the extensions that includes an example. Finally, there is a +:ref:`amdgpu-dwarf-references` section. There are a number of notes included +that raise open questions, or provide alternative approaches considered. The +extensions seek to be general in nature and backwards compatible with DWARF +Version 5. The goal is to be applicable to meeting the needs of any +heterogeneous system and not be vendor or architecture specific. + +A fundamental aspect of the extensions is that they allow DWARF expression +location descriptions as stack elements. The extensions are based on DWARF Version 5 and maintain compatibility with DWARF Version 5. After attempting -several alternatives, the current thinking is that such an addition to DWARF -Version 5 is the simplest and cleanest way to support debugging optimized GPU +several alternatives, the current thinking is that such extensions to DWARF +Version 5 are the simplest and cleanest ways to support debugging optimized GPU code. It also appears to be generally useful and may be able to address other reported DWARF issues, as well as being helpful in providing better optimization support for non-GPU code. -General feedback on this draft proposal is sought, together with suggestions on -how to clarify, simplify, or organize it before submitting it as a formal DWARF -proposal. The current draft proposal is large and may need to be split into -separate proposals before formal submission. Any suggestions on how best to do -that are appreciated. However, at the initial review stage it is believed there -is value in presenting a unified proposal as there are mutual dependencies -between the various parts that would not be as apparent if it was broken up into -separate independent proposals. +General feedback on these extensions is sought, together with suggestions on how +to clarify, simplify, or organize them. If there is general interest, then some +or all of these extensions could be submitted as future DWARF proposals. -We are in the process of modifying LLVM and GDB to support this draft proposal +We are in the process of modifying LLVM and GDB to support these extensions, which is providing experience and insights. We plan to upstream the changes to -those projects for any final form of the proposal. +those projects for any final form of the extensions. The author very much appreciates the input provided so far by many others which has been incorporated into this current version. @@ -83,11 +77,10 @@ has been incorporated into this current version. Motivation ========== -This document proposes a set of backwards compatible extensions to DWARF Version -5 [:ref:`DWARF `] for consideration of inclusion into a -future DWARF Version 6 standard to support heterogeneous debugging. +This document presents a set of backwards compatible extensions to DWARF Version +5 [:ref:`DWARF `] to support heterogeneous debugging.
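As a minimal illustration of the fundamental aspect described above, here is a C++ sketch of an evaluation stack whose entries may be either values or location descriptions; all names are invented for illustration and this is not LLVM's or GDB's implementation::

    #include <cstdint>
    #include <variant>
    #include <vector>

    // A typed literal value; the base type is reduced to a bit size here.
    struct Value {
      uint64_t Bits;
      unsigned TypeBitSize;
    };

    // A location description; its structure is sketched in a later example.
    struct LocationDesc {};

    // The key change relative to DWARF Version 5: a stack entry may be either
    // kind, not only a value.
    using StackEntry = std::variant<Value, LocationDesc>;
    using EvalStack = std::vector<StackEntry>;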
-The remainder of this section provides motivation for each proposed feature in +The remainder of this section provides motivation for each extension in terms of heterogeneous debugging on commercially available AMD GPU hardware (AMDGPU). The goal is to add support to the AMD [:ref:`AMD `] open source Radeon Open Compute Platform (ROCm) [:ref:`AMD-ROCm @@ -102,21 +95,21 @@ to work with third parties to enable support for AMDGPU debugging in the GCC compiler [:ref:`GCC `] and the Perforce TotalView HPC debugger [:ref:`Perforce-TotalView `]. -However, the proposal is intended to be vendor and architecture neutral. It is -believed to apply to other heterogeous hardware devices including GPUs, DSPs, -FPGAs, and other specialized hardware. These collectively include similar -characteristics and requirements as AMDGPU devices. Parts of the proposal can +However, the extensions are intended to be vendor and architecture neutral. They +are believed to apply to other heterogeneous hardware devices including GPUs, +DSPs, FPGAs, and other specialized hardware. These collectively include similar +characteristics and requirements as AMDGPU devices. Some of the extensions can also apply to traditional CPU hardware that supports large vector registers. Compilers can map source languages and extensions that describe large scale parallel execution onto the lanes of the vector registers. This is common in -programming languages used in ML and HPC. The proposal also includes improved +programming languages used in ML and HPC. The extensions also include improved support for optimized code on any architecture. Some of the generalizations may also benefit other issues that have been raised. -The proposal has evolved though collaboration with many individuals and active -prototyping within the GDB debugger and LLVM compiler. Input has also been very -much appreciated from the developers working on the Perforce TotalView HPC -Debugger and GCC compiler. +The extensions have evolved through collaboration with many individuals and +active prototyping within the GDB debugger and LLVM compiler. Input has also +been very much appreciated from the developers working on the Perforce TotalView +HPC Debugger and GCC compiler. The AMDGPU has several features that require additional DWARF functionality in order to support optimized code. @@ -162,11 +155,11 @@ required. See ``DW_AT_LLVM_vector_size``. If the source language is mapped onto the AMDGPU wavefronts in a SIMT manner, then the variable DWARF location expressions must compute the location for a -single lane of the wavefront. Therefore, a DWARF operation is required to -denote the current lane, much like ``DW_OP_push_object_address`` denotes the -current object. The ``DW_OP_*piece`` operations only allow literal indices. -Therefore, a way to use a computed offset of an arbitrary location description -(such as a vector register) is required. See ``DW_OP_LLVM_push_lane``, +single lane of the wavefront. Therefore, a DWARF operation is required to denote +the current lane, much like ``DW_OP_push_object_address`` denotes the current +object. The ``DW_OP_*piece`` operations only allow literal indices. Therefore, a +way to use a computed offset of an arbitrary location description (such as a +vector register) is required. See ``DW_OP_LLVM_push_lane``, ``DW_OP_LLVM_offset``, ``DW_OP_LLVM_offset_uconst``, and ``DW_OP_LLVM_bit_offset``.
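To make the SIMT motivation concrete, a hedged C++ sketch of how a consumer could select the current lane's element of a vector register; the helper names are invented, and only the operation names in the comment come from the text above::

    #include <cstdint>

    // A register location: a DWARF register number plus a bit offset into it.
    struct RegisterLoc {
      unsigned DwarfRegNum;
      uint64_t BitOffset;
    };

    // Select lane Lane's ElemBits-wide element of vector register VecReg, in
    // the spirit of: DW_OP_regx VecReg; DW_OP_LLVM_push_lane;
    // DW_OP_constu ElemBits; DW_OP_mul; DW_OP_LLVM_bit_offset.
    RegisterLoc laneElement(unsigned VecReg, unsigned Lane, unsigned ElemBits) {
      return {VecReg, static_cast<uint64_t>(Lane) * ElemBits};
    }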
@@ -250,12 +243,12 @@ describe when bytes are in registers, are implicit, or a composite of these, the define that integer overflow causes wrap-around. The offset operations can operate on location storage of any size. For example, implicit location storage could be any number of bits in size. It is simpler to define offsets that exceed -the size of the location storage as being ill-formed, than having to force an -implementation to support potentially infinite precision offsets to allow it to -correctly track a series of positive and negative offsets that may transiently -overflow or underflow, but end up in range. This is simple for the arithmetic -operations as they are defined in terms of two's compliment arithmetic on a base -type of a fixed size. +the size of the location storage as being an evaluation error, than having to +force an implementation to support potentially infinite precision offsets to +allow it to correctly track a series of positive and negative offsets that may +transiently overflow or underflow, but end up in range. This is simple for the +arithmetic operations as they are defined in terms of two's complement +arithmetic on a base type of a fixed size. Having the offset operations allows ``DW_OP_push_object_address`` to push a location description that may be in a register, or be an implicit value, and the @@ -276,7 +269,7 @@ larger than the generic type. See ``DW_OP_LLVM_form_aspace_address``. If the ``DW_OP_LLVM_form_aspace_address`` operation had to produce a value that can be implicitly converted to a memory location description, then it would be limited to the size of the generic type which matches the size of the -default address space. Its value would be unspecified and likely not match any +default address space. Its value would be undefined and likely not match any value in the actual program. By making the result a location description, it allows a consumer great freedom in how it implements it. The implicit conversion back to a value can be limited only to the default address space to @@ -317,7 +310,7 @@ consumer on how to implement them. They allow the address space to encode lane information so they can be used to read memory with only the memory description and no extra arguments. The same set of operations can operate on locations independent of their kind of storage. The ``DW_OP_deref*`` therefore -can be used on any storage kind. ``DW_OP_xderef*`` is unnecessary except to +can be used on any storage kind. ``DW_OP_xderef*`` is unnecessary, except to become a more compact way to convert a non-default address space address followed by dereferencing it. @@ -332,8 +325,8 @@ attributes such as ``DW_AT_data_member_location``, ``DW_AT_use_location``, and on the expression stack before evaluating the expression. However, DWARF Version 5 only allows the stack to contain values and so only a single memory address can be on the stack which makes these incapable of handling location -descriptions with multiple places, or places other than memory. Since this -proposal allows the stack to contain location descriptions, the operations are +descriptions with multiple places, or places other than memory. Since these +extensions allow the stack to contain location descriptions, the operations are generalized to support location descriptions that can have multiple places. This is backwards compatible with DWARF Version 5 and allows objects with multiple places to be supported.
For example, the expression that describes @@ -345,8 +338,8 @@ unified into a single section that describes DWARF expressions in general. This unification seems to be a natural consequence and a necessity of allowing location descriptions to be part of the evaluation stack. -For those familiar with the definition of location descriptions in DWARF -Version 5, the definition in this proposal is presented differently, but does +For those familiar with the definition of location descriptions in DWARF Version +5, the definitions in these extensions are presented differently, but do in fact define the same concept with the same fundamental semantics. However, it does so in a way that allows the concept to extend to support address spaces, bit addressing, the ability for composite location descriptions to be @@ -354,7 +347,7 @@ composed of any kind of location description, and the ability to support objects located at multiple places. Collectively these changes expand the set of processors that can be supported and improve support for optimized code. -Several approaches were considered, and the one proposed appears to be the +Several approaches were considered, and the one presented appears to be the cleanest and offers the greatest improvement of DWARF's ability to support optimized code. Examining the GDB debugger and LLVM compiler, it appears only to require modest changes as they both already have to support general use of @@ -412,22 +405,22 @@ style based on the DWARF Version 5 specification. Non-normative text is shown in *italics*. The names for the new operations, attributes, and constants include "\ -``LLVM``\ " and are encoded with vendor specific codes so this proposal can be -implemented as an LLVM vendor extension to DWARF Version 5. If accepted these +``LLVM``\ " and are encoded with vendor specific codes so these extensions can +be implemented as an LLVM vendor extension to DWARF Version 5. If accepted these names would not include the "\ ``LLVM``\ " and would not use encodings in the vendor range. -The proposal is described in -:ref:`amdgpu-dwarf-proposed-changes-relative-to-dwarf-version-5` and is +The extensions are described in +:ref:`amdgpu-dwarf-changes-relative-to-dwarf-version-5` and are organized to follow the section ordering of DWARF Version 5. It includes notes to indicate the corresponding DWARF Version 5 sections to which they pertain. Other notes describe additional changes that may be worth considering, and to raise questions. -.. _amdgpu-dwarf-proposed-changes-relative-to-dwarf-version-5: +.. _amdgpu-dwarf-changes-relative-to-dwarf-version-5: -Proposed Changes Relative to DWARF Version 5 -============================================ +Changes Relative to DWARF Version 5 +=================================== General Description ------------------- @@ -462,10 +455,10 @@ DWARF Expressions .. note:: - This section, and its nested sections, replaces DWARF Version 5 section 2.5 and - section 2.6. The new proposed DWARF expression operations are defined as well - as clarifying the extensions to already existing DWARF Version 5 operations. It is - based on the text of the existing DWARF Version 5 standard. + This section, and its nested sections, replaces DWARF Version 5 section 2.5 + and section 2.6. The new DWARF expression operation extensions are defined as + well as clarifying the extensions to already existing DWARF Version 5 + operations. It is based on the text of the existing DWARF Version 5 standard.
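For orientation, vendor extensions occupy the operation code range DWARF Version 5 reserves for them; the sketch below shows that range, with a purely hypothetical value standing in for the DW_OP_LLVM_* encodings (the actual values are not given in this document)::

    #include <cstdint>

    enum DwarfOpEncoding : uint8_t {
      DW_OP_lo_user = 0xe0, // start of the DWARF Version 5 vendor range
      DW_OP_hi_user = 0xff, // end of the vendor range
      // Hypothetical assignment, for illustration only:
      DW_OP_LLVM_example_op = 0xe9,
    };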
DWARF expressions describe how to compute a value or specify a location. @@ -473,155 +466,215 @@ DWARF expressions describe how to compute a value or specify a location. value of an array bound, the length of a dynamic string, the desired value itself, and so on.* -The evaluation of a DWARF expression can either result in a value or a location -description: +If the evaluation of a DWARF expression does not encounter an error, then it can +either result in a value (see :ref:`amdgpu-dwarf-expression-value`) or a +location description (see :ref:`amdgpu-dwarf-location-description`). When a +DWARF expression is evaluated, it may be specified whether a value or location +description is required as the result kind. -*value* +If a result kind is specified, and the result of the evaluation does not match +the specified result kind, then the implicit conversions described in +:ref:`amdgpu-dwarf-memory-location-description-operations` are performed if +valid. Otherwise, the DWARF expression is ill-formed. - A value has a type and a literal value. It can represent a literal value of - any supported base type of the target architecture. The base type specifies - the size and encoding of the literal value. +If the evaluation of a DWARF expression encounters an evaluation error, then the +result is an evaluation error. - .. note:: +.. note:: - It may be desirable to add an implicit pointer base type encoding. It would - be used for the type of the value that is produced when the ``DW_OP_deref*`` - operation retrieves the full contents of an implicit pointer location - storage created by the ``DW_OP_implicit_pointer`` or - ``DW_OP_LLVM_aspace_implicit_pointer`` operations. The literal value would - record the debugging information entry and byte dispacement specified by the - associated ``DW_OP_implicit_pointer`` or - ``DW_OP_LLVM_aspace_implicit_pointer`` operations. + Decided to define the concept of an evaluation error. An alternative is to + introduce an undefined value base type in a similar way to location + descriptions having an undefined location description. Then operations that + encounter an evaluation error can return the undefined location description or + value with an undefined base type. - Instead of a base type, a value can have a distinguished generic type, which - is an integral type that has the size of an address in the target architecture - default address space and unspecified signedness. + All operations that act on values would return an undefined entity if given an + undefined value. The expression would then always evaluate to completion, and + can be tested to determine if it is an undefined entity. - *The generic type is the same as the unspecified type used for stack - operations defined in DWARF Version 4 and before.* + However, this would add considerable additional complexity and does not match + that GDB throws an exception when these evaluation errors occur. - An integral type is a base type that has an encoding of ``DW_ATE_signed``, - ``DW_ATE_signed_char``, ``DW_ATE_unsigned``, ``DW_ATE_unsigned_char``, - ``DW_ATE_boolean``, or any target architecture defined integral encoding in - the inclusive range ``DW_ATE_lo_user`` to ``DW_ATE_hi_user``. +If a DWARF expression is ill-formed, then the result is undefined. - .. note:: +The following sections detail the rules for when a DWARF expression is +ill-formed or results in an evaluation error. - It is unclear if ``DW_ATE_address`` is an integral type. GDB does not seem - to consider it as integral. 
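The integral-type rule above is mechanical enough to state as code; here is a sketch using the DW_ATE_* constants from llvm/BinaryFormat/Dwarf.h, with the helper itself invented (note it accepts the whole user range, whereas the text admits only target-defined integral encodings within it)::

    #include "llvm/BinaryFormat/Dwarf.h"

    // True if Enc is an encoding the text above treats as integral.
    static bool isIntegralEncoding(unsigned Enc) {
      using namespace llvm::dwarf;
      return Enc == DW_ATE_signed || Enc == DW_ATE_signed_char ||
             Enc == DW_ATE_unsigned || Enc == DW_ATE_unsigned_char ||
             Enc == DW_ATE_boolean ||
             (Enc >= DW_ATE_lo_user && Enc <= DW_ATE_hi_user);
    }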
+A DWARF expression can either be encoded as an operation expression (see +:ref:`amdgpu-dwarf-operation-expressions`), or as a location list expression +(see :ref:`amdgpu-dwarf-location-list-expressions`). -*location description* +.. _amdgpu-dwarf-expression-evaluation-context: - *Debugging information must provide consumers a way to find the location of - program variables, determine the bounds of dynamic arrays and strings, and - possibly to find the base address of a subprogram’s stack frame or the return - address of a subprogram. Furthermore, to meet the needs of recent computer - architectures and optimization techniques, debugging information must be able - to describe the location of an object whose location changes over the object’s - lifetime, and may reside at multiple locations simultaneously during parts of - an object's lifetime.* +DWARF Expression Evaluation Context ++++++++++++++++++++++++++++++++++++ - Information about the location of program objects is provided by location - descriptions. +A DWARF expression is evaluated in a context that can include a number of +context elements. If multiple context elements are specified then they must be +self-consistent or the result of the evaluation is undefined. The context +elements that can be specified are: - Location descriptions can consist of one or more single location descriptions. +*A current result kind* - A single location description specifies the location storage that holds a - program object and a position within the location storage where the program - object starts. The position within the location storage is expressed as a bit - offset relative to the start of the location storage. + The kind of result required by the DWARF expression evaluation. If specified, + it can be a location description or a value. - A location storage is a linear stream of bits that can hold values. Each - location storage has a size in bits and can be accessed using a zero-based bit - offset. The ordering of bits within a location storage uses the bit numbering - and direction conventions that are appropriate to the current language on the - target architecture. +*A current thread* - There are five kinds of location storage: + The target architecture thread identifier of the source program thread of + execution for which a user presented expression is currently being evaluated. - *memory location storage* - Corresponds to the target architecture memory address spaces. + It is required for operations that are related to target architecture threads. - *register location storage* - Corresponds to the target architecture registers. + *For example, the* ``DW_OP_form_tls_address`` *operation and* + ``DW_OP_LLVM_form_aspace_address`` *operation when given an address space that + is thread specific.* - *implicit location storage* - Corresponds to fixed values that can only be read. +*A current lane* - *undefined location storage* - Indicates no value is available and therefore cannot be read or written. + The target architecture lane identifier of the source program thread of + execution for which a user presented expression is currently being evaluated. + This applies to languages that are implemented using a SIMD or SIMT execution + model. - *composite location storage* - Allows a mixture of these where some bits come from one location storage and - some from another location storage, or from disjoint parts of the same - location storage. + It is required for operations that are related to target architecture lanes. - ..
note:: + *For example, the* ``DW_OP_LLVM_push_lane`` *operation and* + ``DW_OP_LLVM_form_aspace_address`` *operation when given an address space that + is lane specific.* - It may be better to add an implicit pointer location storage kind used by - the ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_aspace_implicit_pointer`` - operations. It would specify the debugger information entry and byte offset - provided by the operations. - - *Location descriptions are a language independent representation of addressing - rules. They are created using DWARF operation expressions of arbitrary - complexity. They can be the result of evaluting a debugger information entry - attribute that specifies an operation expression. In this usage they can - describe the location of an object as long as its lifetime is either static or - the same as the lexical block (see DWARF Version 5 section 3.5) that owns it, - and it does not move during its lifetime. They can be the result of evaluating - a debugger information entry attribute that specifies a location list - expression. In this usage they can describe the location of an object that has - a limited lifetime, changes its location during its lifetime, or has multiple - locations over part or all of its lifetime.* - - If a location description has more than one single location description, the - DWARF expression is ill-formed if the object value held in each single - location description's position within the associated location storage is not - the same value, except for the parts of the value that are uninitialized. - - *A location description that has more than one single location description can - only be created by a location list expression that has overlapping program - location ranges, or certain expression operations that act on a location - description that has more than one single location description. There are no - operation expression operations that can directly create a location - description with more than one single location description.* - - *A location description with more than one single location description can be - used to describe objects that reside in more than one piece of storage at the - same time. An object may have more than one location as a result of - optimization. For example, a value that is only read may be promoted from - memory to a register for some region of code, but later code may revert to - reading the value from memory as the register may be used for other purposes. - For the code region where the value is in a register, any change to the object - value must be made in both the register and the memory so both regions of code - will read the updated value.* - - *A consumer of a location description with more than one single location - description can read the object's value from any of the single location - descriptions (since they all refer to location storage that has the same - value), but must write any changed value to all the single location - descriptions.* + If specified, it must be consistent with any specified current thread and + current target architecture. It is consistent with a thread if it identifies a + lane of the thread. It is consistent with a target architecture if it is a + valid lane identifier of the target architecture. Otherwise the result is + undefined. -A DWARF expression can either be encoded as a operation expression (see -:ref:`amdgpu-dwarf-operation-expressions`), or as a location list expression -(see :ref:`amdgpu-dwarf-location-list-expressions`). 
+*A current call frame* -A DWARF expression is evaluated in the context of: - -*A current subprogram* - This may be used in the evaluation of register access operations to support - virtual unwinding of the call stack (see + The target architecture call frame identifier. It identifies a call frame that + corresponds to an active invocation of a subprogram in the current thread. It + is identified by its address on the call stack. The address is referred to as + the Canonical Frame Address (CFA). The call frame information is used to + determine the CFA for the call frames of the current thread's call stack (see :ref:`amdgpu-dwarf-call-frame-information`). + It is required for operations that specify target architecture registers to + support virtual unwinding of the call stack. + + *For example, the* ``DW_OP_*reg*`` *operations.* + + If specified, it must be an active call frame in the current thread. If the + current lane is specified, then that lane must have been active on entry to + the call frame (see the ``DW_AT_LLVM_lane_pc`` attribute). Otherwise the + result is undefined. + + If it is the currently executing call frame, then it is termed the top call + frame. + *A current program location* - This may be used in the evaluation of location list expressions to select - amongst multiple program location ranges. It should be the program location - corresponding to the current subprogram. If the current subprogram was reached - by virtual call stack unwinding, then the program location will correspond to - the associated call site. + + The target architecture program location corresponding to the current call + frame of the current thread. + + The program location of the top call frame is the target architecture program + counter for the current thread. The call frame information is used to obtain + the value of the return address register to determine the program location of + the other call frames (see :ref:`amdgpu-dwarf-call-frame-information`). + + It is required for the evaluation of location list expressions to select + amongst multiple program location ranges. It is required for operations that + specify target architecture registers to support virtual unwinding of the call + stack (see :ref:`amdgpu-dwarf-call-frame-information`). + + If specified: + + * If the current lane is not specified: + + * If the current call frame is the top call frame, it must be the current + target architecture program location. + + * If the current call frame F is not the top call frame, it must be the + program location associated with the call site in the current caller frame + F that invoked the callee frame. + + * If the current lane is specified and the architecture program location LPC + computed by the ``DW_AT_LLVM_lane_pc`` attribute for the current lane is not + the undefined location description (indicating the lane was not active on + entry to the call frame), it must be LPC. + + * Otherwise the result is undefined. + +*A current compilation unit* + + The compilation unit debug information entry that contains the DWARF expression + being evaluated. + + It is required for operations that reference debug information associated with + the same compilation unit, including indicating if such references use the + 32-bit or 64-bit DWARF format. It can also provide the default address space + address size if no current target architecture is specified. 
+ + *For example, the* ``DW_OP_constx`` *and* ``DW_OP_addrx`` *operations.* + + *Note that this compilation unit may not be the same as the compilation unit + determined from the loaded code object corresponding to the current program + location. For example, the expression E associated with a + ``DW_AT_location`` attribute of the debug information entry operand of the + ``DW_OP_call*`` operations is evaluated with the compilation unit that + contains E and not the one that contains the ``DW_OP_call*`` operation + expression.* + +*A current target architecture* + + The target architecture. + + It is required for operations that specify target architecture specific + entities. + + *For example, target architecture specific entities include DWARF register + identifiers, DWARF lane identifiers, DWARF address space identifiers, the + default address space, and the address space address sizes.* + + If specified: + + * If the current thread is specified, then the current target architecture + must be the same as the target architecture of the current thread. + + * If the current compilation unit is specified, then the current target + architecture default address space address size must be the same as the + ``address_size`` field in the header of the current compilation unit and any + associated entry in the ``.debug_aranges`` section. + + * If the current program location is specified, then the current target + architecture must be the same as the target architecture of any line number + information entry (see :ref:`amdgpu-dwarf-line-number-information`) + corresponding to the current program location. + + * If the current program location is specified, then the current target + architecture default address space address size must be the same as the + ``address_size`` field in the header of any entry corresponding to the + current program location in the ``.debug_addr``, ``.debug_line``, + ``.debug_rnglists``, ``.debug_rnglists.dwo``, ``.debug_loclists``, and + ``.debug_loclists.dwo`` sections. + + * Otherwise the result is undefined. + +*A current object* + + The location description of a program object. + + It is required for the ``DW_OP_push_object_address`` operation. + + *For example, the* ``DW_AT_data_location`` *attribute on type debug + information entries specifies the program object corresponding to a + runtime descriptor as the current object when it evaluates its associated + expression.* + + The result is undefined if the location descriptor is invalid (see + :ref:`amdgpu-dwarf-location-description`). *An initial stack* + This is a list of values or location descriptions that will be pushed on the operation expression evaluation stack in the order provided before evaluation of an operation expression starts. @@ -630,13 +683,204 @@ A DWARF expression is evaluated in the context of: expression value with initial stack entries. In all other cases the initial stack is empty. -When a DWARF expression is evaluated, it may be specified whether a value or -location description is required as the result kind. + The result is undefined if any location descriptors are invalid (see + :ref:`amdgpu-dwarf-location-description`). -If a result kind is specified, and the result of the evaluation does not match -the specified result kind, then the implicit conversions described in -:ref:`amdgpu-dwarf-memory-location-description-operations` are performed if -valid. Otherwise, the DWARF expression is ill-formed.
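Summarizing the context elements above in one place, a minimal C++ sketch of a context record in which every element is optional; the field names and identifier types are invented, and a real consumer would use richer types::

    #include <cstdint>
    #include <optional>
    #include <vector>

    enum class ResultKind { Value, Location };

    struct EvalContext {
      std::optional<ResultKind> Kind;        // current result kind
      std::optional<uint64_t> Thread;        // current thread
      std::optional<uint64_t> Lane;          // current lane
      std::optional<uint64_t> CallFrameCFA;  // current call frame, by its CFA
      std::optional<uint64_t> ProgramLoc;    // current program location
      std::optional<uint64_t> CompileUnit;   // current compilation unit offset
      std::optional<uint64_t> TargetArch;    // current target architecture id
      std::optional<uint64_t> CurrentObject; // current object (location elided)
      std::vector<uint64_t> InitialStack;    // initial stack (values only here)
    };

Evaluation would report an error whenever it needs an element that was not supplied, matching the rule stated in the text that follows.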
+If the evaluation requires a context element that is not specified, then the +result of the evaluation is an error. + +*A DWARF expression for the location description may be able to be evaluated +without a thread, lane, call frame, program location, or architecture context. +For example, the location of a global variable may be able to be evaluated +without such context. If the expression evaluates with an error then it may +indicate the variable has been optimized and so requires more context.* + +*The DWARF expressions for call frame information operations (see +:ref:`amdgpu-dwarf-call-frame-information`) are restricted to those that do not +require the compilation unit context to be specified.* + +The DWARF is ill-formed if the ``address_size`` fields in the headers of all +the entries in the ``.debug_info``, ``.debug_addr``, ``.debug_line``, +``.debug_rnglists``, ``.debug_rnglists.dwo``, ``.debug_loclists``, and +``.debug_loclists.dwo`` sections corresponding to any given program location do +not all match. + +.. _amdgpu-dwarf-expression-value: + +DWARF Expression Value +++++++++++++++++++++++ + +A value has a type and a literal value. It can represent a literal value of any +supported base type of the target architecture. The base type specifies the size +and encoding of the literal value. + +.. note:: + + It may be desirable to add an implicit pointer base type encoding. It would be + used for the type of the value that is produced when the ``DW_OP_deref*`` + operation retrieves the full contents of an implicit pointer location storage + created by the ``DW_OP_implicit_pointer`` or + ``DW_OP_LLVM_aspace_implicit_pointer`` operations. The literal value would + record the debugging information entry and byte displacement specified by the + associated ``DW_OP_implicit_pointer`` or + ``DW_OP_LLVM_aspace_implicit_pointer`` operations. + +There is a distinguished base type termed the generic type, which is an integral +type that has the size of an address in the target architecture default address +space and unspecified signedness. + +*The generic type is the same as the unspecified type used for stack operations +defined in DWARF Version 4 and before.* + +An integral type is a base type that has an encoding of ``DW_ATE_signed``, +``DW_ATE_signed_char``, ``DW_ATE_unsigned``, ``DW_ATE_unsigned_char``, +``DW_ATE_boolean``, or any target architecture defined integral encoding in the +inclusive range ``DW_ATE_lo_user`` to ``DW_ATE_hi_user``. + +.. note:: + + It is unclear if ``DW_ATE_address`` is an integral type. GDB does not seem to + consider it as integral. + +.. _amdgpu-dwarf-location-description: + +DWARF Location Description +++++++++++++++++++++++++++ + +*Debugging information must provide consumers a way to find the location of +program variables, determine the bounds of dynamic arrays and strings, and +possibly to find the base address of a subprogram’s call frame or the return +address of a subprogram. Furthermore, to meet the needs of recent computer +architectures and optimization techniques, debugging information must be able to +describe the location of an object whose location changes over the object’s +lifetime, and may reside at multiple locations simultaneously during parts of an +object's lifetime.* + +Information about the location of program objects is provided by location +descriptions. + +Location descriptions can consist of one or more single location descriptions.
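A sketch of the data structure the following paragraphs define: a location description is one or more single location descriptions, each naming a storage kind and a bit position within that storage (illustrative only, not LLVM API)::

    #include <cstdint>
    #include <vector>

    // The five kinds of location storage enumerated below.
    enum class StorageKind { Memory, Register, Implicit, Undefined, Composite };

    // A single location description: a storage plus a bit position in it.
    struct SingleLocation {
      StorageKind Kind;
      uint64_t BitOffset; // zero-based bit offset into the storage
    };

    // A location description: one or more single location descriptions that
    // must all hold the same object value (read from any one; write to all).
    using LocationDescription = std::vector<SingleLocation>;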
+ +A single location description specifies the location storage that holds a +program object and a position within the location storage where the program +object starts. The position within the location storage is expressed as a bit +offset relative to the start of the location storage. + +A location storage is a linear stream of bits that can hold values. Each +location storage has a size in bits and can be accessed using a zero-based bit +offset. The ordering of bits within a location storage uses the bit numbering +and direction conventions that are appropriate to the current language on the +target architecture. + +There are five kinds of location storage: + +*memory location storage* + Corresponds to the target architecture memory address spaces. + +*register location storage* + Corresponds to the target architecture registers. + +*implicit location storage* + Corresponds to fixed values that can only be read. + +*undefined location storage* + Indicates no value is available and therefore cannot be read or written. + +*composite location storage* + Allows a mixture of these where some bits come from one location storage and + some from another location storage, or from disjoint parts of the same + location storage. + +.. note:: + + It may be better to add an implicit pointer location storage kind used by the + ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_aspace_implicit_pointer`` + operations. It would specify the debugger information entry and byte offset + provided by the operations. + +*Location descriptions are a language independent representation of addressing +rules. They are created using DWARF operation expressions of arbitrary +complexity. They can be the result of evaluating a debugger information entry +attribute that specifies an operation expression. In this usage they can +describe the location of an object as long as its lifetime is either static or +the same as the lexical block (see DWARF Version 5 section 3.5) that owns it, +and it does not move during its lifetime. They can be the result of evaluating a +debugger information entry attribute that specifies a location list expression. +In this usage they can describe the location of an object that has a limited +lifetime, changes its location during its lifetime, or has multiple locations +over part or all of its lifetime.* + +If a location description has more than one single location description, the +DWARF expression is ill-formed if the object value held in each single location +description's position within the associated location storage is not the same +value, except for the parts of the value that are uninitialized. + +*A location description that has more than one single location description can +only be created by a location list expression that has overlapping program +location ranges, or certain expression operations that act on a location +description that has more than one single location description. There are no +operation expression operations that can directly create a location description +with more than one single location description.* + +*A location description with more than one single location description can be +used to describe objects that reside in more than one piece of storage at the +same time. An object may have more than one location as a result of +optimization. For example, a value that is only read may be promoted from memory +to a register for some region of code, but later code may revert to reading the +value from memory as the register may be used for other purposes. 
For the code +region where the value is in a register, any change to the object value must be +made in both the register and the memory so both regions of code will read the +updated value.* + +*A consumer of a location description with more than one single location +description can read the object's value from any of the single location +descriptions (since they all refer to location storage that has the same value), +but must write any changed value to all the single location descriptions.* + +The evaluation of an expression may require context elements to create a +location description. If such a location description is accessed, the storage it +denotes is that associated with the context element values specified when the +location description was created, which may differ from the context at the time +it is accessed. + +*For example, creating a register location description requires the thread +context: the location storage is for the specified register of that thread. +Creating a memory location description for an address space may require a +thread and a lane context: the location storage is the memory associated with +that thread and lane.* + +If any of the context elements required to create a location description change, +the location description becomes invalid and accessing it is undefined. + +*Examples of context that can invalidate a location description are:* + +* *The thread context is required and execution causes the thread to terminate.* +* *The call frame context is required and further execution causes the call + frame to return to the calling frame.* +* *The program location is required and further execution of the thread occurs. + That could change the location list entry or call frame information entry that + applies.* +* *An operation uses call frame information:* + + * *Any of the frames used in the virtual call frame unwinding return.* + * *The top call frame is used, the program location is used to select the call + frame information entry, and further execution of the thread occurs.* + +*A DWARF expression can be used to compute a location description for an object. +A subsequent DWARF expression evaluation can be given the object location +description as the object context or initial stack context to compute a +component of the object. The final result is undefined if the object location +description becomes invalid between the two expression evaluations.* + +A change of a thread's program location may not make a location description +invalid, yet may still render it as no longer meaningful. Accessing such a +location description, or using it as the object context or initial stack context +of an expression evaluation, may produce an undefined result. + +*For example, a location description may specify a register that no longer holds +the intended program object after a program location change. One way to avoid +such problems is to recompute location descriptions associated with threads when +their program locations change.* .. _amdgpu-dwarf-operation-expressions: Operation Expressions --------------------- @@ -658,10 +902,16 @@ operation expression is ill-formed. Evaluation of an operation expression starts with an empty stack on which the entries from the initial stack provided by the context are pushed in the order provided. Then the operations are evaluated, starting with the first operation -of the stream, until one past the last operation of the stream is reached. The -result of the evaluation is: +of the stream.
Evaluation continues until either an operation has an evaluation +error, or until one past the last operation of the stream is reached. + +The result of the evaluation is: -* If evaluation of the DWARF expression requires a location description, then: +* If an operation has an evaluation error, or an operation evaluates an + expression that has an evaluation error, then the result is an evaluation + error. + +* If the current result kind specifies a location description, then: * If the stack is empty, the result is a location description with one undefined location description. @@ -682,7 +932,7 @@ result of the evaluation is: Could define this case as returning an implicit location description as if the ``DW_OP_implicit`` operation is performed. -* If evaluation of the DWARF expression requires a value, then: +* If the current result kind specifies a value, then: * If the top stack entry is a value, or can be converted to one (see :ref:`amdgpu-dwarf-memory-location-description-operations`), then the result @@ -691,8 +941,7 @@ result of the evaluation is: * Otherwise the DWARF expression is ill-formed. -* If evaluation of the DWARF expression does not specify if a value or location - description is required, then: +* If the current result kind is not specified, then: * If the stack is empty, the result is a location description with one undefined location description. @@ -837,24 +1086,43 @@ expression. ``DW_OP_call2``, ``DW_OP_call4``, and ``DW_OP_call_ref`` perform DWARF procedure calls during evaluation of a DWARF expression. - ``DW_OP_call2`` and ``DW_OP_call4``, have one operand that is a 2- or 4-byte - unsigned offset, respectively, of a debugging information entry D in the - current compilation unit. + ``DW_OP_call2`` and ``DW_OP_call4``, have one operand that is, respectively, + a 2-byte or 4-byte unsigned offset DR that represents the byte offset of a + debugging information entry D relative to the beginning of the current + compilation unit. ``DW_OP_call_ref`` has one operand that is a 4-byte unsigned value in the 32-bit DWARF format, or an 8-byte unsigned value in the 64-bit DWARF format, - that represents an offset of a debugging information entry D in a - ``.debug_info`` section, which may be contained in an executable or shared - object file other than that containing the operation. For references from - one executable or shared object file to another, the relocation must be - performed by the consumer. + that represents the byte offset DR of a debugging information entry D + relative to the beginning of the ``.debug_info`` section that contains the + current compilation unit. D may not be in the current compilation unit. .. note: - It is unclear how crossing from one executable or shared object file to - another can work. How would a consumer know which executable or shared - object file is being referenced? In an ELF file the DWARF is in a - non-ALLOC segment so standard dynamic relocations cannot be used. + DWARF Version 5 states that DR can be an offset in a ``.debug_info`` + section other than the one that contains the current compilation unit. It + states that relocation of references from one executable or shared object + file to another must be performed by the consumer. But given that DR is + defined as an offset in a ``.debug_info`` section this seems impossible. 
+ If DR was defined as an implementation defined value, then the consumer + could choose to interpret the value in an implementation defined manner to + reference debugging information in another executable or shared object. + + In ELF the ``.debug_info`` section is in a non-\ ``PT_LOAD`` segment so + standard dynamic relocations cannot be used. But even if they were loaded + segments and dynamic relocations were used, DR would need to be the + address of D, not an offset in a ``.debug_info`` section. That would also + need DR to be the size of a global address. So it would not be possible to + use the 32-bit DWARF format in a 64-bit global address space. In addition, + the consumer would need to determine what executable or shared object the + relocated address was in so it could determine the containing compilation + unit. + + GDB only interprets DR as an offset in the ``.debug_info`` section that + contains the current compilation unit. + + This comment also applies to ``DW_OP_implicit_pointer`` and + ``DW_OP_LLVM_aspace_implicit_pointer``. *Operand interpretation of* ``DW_OP_call2``\ *,* ``DW_OP_call4``\ *, and* ``DW_OP_call_ref`` *is exactly like that for* ``DW_FORM_ref2``\ *, @@ -867,8 +1135,13 @@ expression. operation expression continues from the first operation of E. Execution continues until one past the last operation of E is reached, at which point execution continues with the operation following the call operation. - Since E is evaluated on the same stack as the call, E can use, add, and/or - remove entries already on the stack. + The operations of E are evaluated with the same current context, except + the current compilation unit is the one that contains D and the stack is the + same as that being used by the call operation. After the call operation + has been evaluated, the stack is therefore as it is left by the evaluation + of the operations of E. Since E is evaluated on the same stack as the call + operation, E can use and/or remove entries already on the stack, and can + add new entries to the stack. *Values on the stack at the time of the call may be used as parameters by the called expression and values left on the stack by the called expression @@ -877,10 +1150,10 @@ expression. * If D has a ``DW_AT_location`` attribute that is encoded as a ``loclist`` or ``loclistsptr``, then the specified location list expression E is - evaluated, and the resulting location description is pushed on the stack. - The evaluation of E uses a context that has the same current frame and - current program location as the current operation expression, but an empty - initial stack. + evaluated. The evaluation of E uses the current context, except the result + kind is a location description, the compilation unit is the one that + contains D, and the initial stack is empty. The location description + result is pushed on the stack. .. note:: @@ -909,7 +1182,7 @@ expression. GDB implements ``DW_OP_call*`` by always executing E on the same stack. If the location list has multiple matching entries, it simply picks the - first one and ignores the rest. This seems fundementally at odds with + first one and ignores the rest. This seems fundamentally at odds with the desire to support multiple places for variables. So, it feels like ``DW_OP_call*`` should both support pushing a location @@ -930,7 +1203,7 @@ expression. ``exprproc``, ``loclistproc``, and ``loclistsptrproc`` to indicate that the expression is executed on the same stack. ``exprproc`` is the same encoding as ``exprloc``.
``loclistproc`` and ``loclistsptrproc`` are the - same encoding as their non-\ ``proc`` counterparts except the DWARF is + same encoding as their non-\ ``proc`` counterparts, except the DWARF is ill-formed if the location list does not match exactly one location list entry and a default entry is required. These forms indicate explicitly that the matched single operation expression must be executed on the @@ -951,10 +1224,10 @@ expression. information entry. The DWARF would be ill-formed if E is a location list expression that does not match exactly one location list entry. In all other cases the evaluation of an expression E that is the value of a - ``DW_AT_location`` attribute would evaluate E with a context that has - the same current frame and current program location as the current - operation expression, but an empty initial stack, and push the resulting - location description on the stack. + ``DW_AT_location`` attribute would evaluate E with the current context, + except the result kind is a location description, the compilation unit + is the one that contains D, and the initial stack is empty. The location + description result is pushed on the stack. * If D has a ``DW_AT_const_value`` attribute with a value V, then it is as if a ``DW_OP_implicit_value V`` operation was executed. @@ -1070,18 +1343,18 @@ size and the low-order bits used. 9. ``DW_OP_const_type`` ``DW_OP_const_type`` has three operands. The first is an unsigned LEB128 - integer that represents the offset of a debugging information entry D in the - current compilation unit, that provides the type of the constant value. The - second is a 1-byte unsigned integral constant S. The third is a block of - bytes B, with a length equal to S. + integer DR that represents the byte offset of a debugging information entry + D relative to the beginning of the current compilation unit, that provides + the type T of the constant value. The second is a 1-byte unsigned integral + constant S. The third is a block of bytes B, with a length equal to S. - T is the bit size of the type D. The least significant T bits of B are + TS is the bit size of the type T. The least significant TS bits of B are interpreted as a value V of the type D. It pushes the value V with the type D. The DWARF is ill-formed if D is not a ``DW_TAG_base_type`` debugging - information entry, or if T divided by 8 and rounded up to a multiple of 8 - (the byte size) is not equal to S. + information entry in the current compilation unit, or if TS divided by 8 + (the byte size) and rounded up to a whole number is not equal to S. *While the size of the byte block B can be inferred from the type D definition, it is encoded explicitly into the operation so that the @@ -1090,9 +1363,8 @@ size and the low-order bits used. 10. ``DW_OP_LLVM_push_lane`` *New* - ``DW_OP_LLVM_push_lane`` pushes a value with the generic type that is the - target architecture specific lane identifier of the thread of execution for - which a user presented expression is currently being evaluated. + ``DW_OP_LLVM_push_lane`` pushes the target architecture lane identifier of + the current lane as a value with the generic type. *For languages that are implemented using a SIMD or SIMT execution model, this is the lane number that corresponds to the source language thread of @@ -1127,123 +1399,138 @@ There are these special value operations currently defined: ``DW_OP_regval_type`` has two operands. The first is an unsigned LEB128 integer that represents a register number R. 
The second is an unsigned
-   LEB128 integer that represents the offset of a debugging information entry D
-   in the current compilation unit, that provides the type of the register
-   value.
+   LEB128 integer DR that represents the byte offset of a debugging information
+   entry D relative to the beginning of the current compilation unit, that
+   provides the type T of the register value.

-   The contents of register R are interpreted as a value V of the type D. The
-   value V is pushed on the stack with the type D.
-
-   The DWARF is ill-formed if D is not a ``DW_TAG_base_type`` debugging
-   information entry, or if the size of type D is not the same as the size of
-   register R.
+   The operation is equivalent to performing ``DW_OP_regx R; DW_OP_deref_type
+   DR``.

   .. note::

-     Should DWARF allow the type D to be a different size to the size of the
-     register R? Requiring them to be the same bit size avoids any issue of
-     conversion as the bit contents of the register is simply interpreted as a
-     value of the specified type. If a conversion is wanted it can be done
+     Should DWARF allow the type T to be a larger size than the size of the
+     register R? Restricting a larger bit size avoids any issue of conversion
+     as the (possibly truncated) bit contents of the register is simply
+     interpreted as a value of T. If a conversion is wanted it can be done
     explicitly using a ``DW_OP_convert`` operation.

     GDB has a per register hook that allows a target specific conversion on a
-     register by register basis. It defaults to truncation of bigger registers,
-     and to actually reading bytes from the next register (or reads out of
-     bounds for the last register) for smaller registers. There are no GDB
-     tests that read a register out of bounds (except an illegal hand written
-     assembly test).
+     register by register basis. It defaults to truncation of bigger registers.
+     Removing use of the target hook does not cause any test failures in common
+     architectures. If the compiler for a target architecture did want some
+     form of conversion, including a larger result type, it could always
+     explicitly use the ``DW_OP_convert`` operation.
+
+     If T is a larger type than the register size, then the default GDB
+     register hook reads bytes from the next register (or reads out of bounds
+     for the last register!). Removing use of the target hook does not cause
+     any test failures in common architectures (except an illegal hand written
+     assembly test). If a target architecture requires this behavior, these
+     extensions allow a composite location description to be used to combine
+     multiple registers.

2. ``DW_OP_deref``

-   The ``DW_OP_deref`` operation pops one stack entry that must be a location
-   description L.
-
-   A value of the bit size of the generic type is retrieved from the location
-   storage specified by L. The value V retrieved is pushed on the stack with
-   the generic type.
+   S is the bit size of the generic type divided by 8 (the byte size) and
+   rounded up to a whole number. DR is the offset of a hypothetical debug
+   information entry D in the current compilation unit for a base type of the
+   generic type.

-   If any bit of the value is retrieved from the undefined location storage, or
-   the offset of any bit exceeds the size of the location storage specified by
-   L, then the DWARF expression is ill-formed.
- - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit location descriptions created by the - ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` - operations. - - *If L, or the location description of any composite location description - part that is a subcomponent of L, has more than one single location - description, then any one of them can be selected as they are required to - all have the same value. For any single location description SL, bits are - retrieved from the associated storage location starting at the bit offset - specified by SL. For a composite location description, the retrieved bits - are the concatenation of the N bits from each composite location part PL, - where N is limited to the size of PL.* + The operation is equivalent to performing ``DW_OP_deref_type S, DR``. 3. ``DW_OP_deref_size`` ``DW_OP_deref_size`` has a single 1-byte unsigned integral constant that represents a byte result size S. - It pops one stack entry that must be a location description L. - - T is the smaller of the generic type size and S scaled by 8 (the byte size). - A value V of T bits is retrieved from the location storage specified by L. - If V is smaller than the size of the generic type, V is zero-extended to the - generic type size. V is pushed onto the stack with the generic type. - - The DWARF expression is ill-formed if any bit of the value is retrieved from - the undefined location storage, or if the offset of any bit exceeds the size - of the location storage specified by L. + TS is the smaller of the generic type bit size and S scaled by 8 (the byte + size). If TS is smaller than the generic type bit size then T is an unsigned + integral type of bit size TS, otherwise T is the generic type. DR is the + offset of a hypothetical debug information entry D in the current + compilation unit for a base type T. .. note:: Truncating the value when S is larger than the generic type matches what - GDB does. This allows the generic type size to not be a integral byte + GDB does. This allows the generic type size to not be an integral byte size. It does allow S to be arbitrarily large. Should S be restricted to the size of the generic type rounded up to a multiple of 8? - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit location descriptions created by the - ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` - operations. + The operation is equivalent to performing ``DW_OP_deref_type S, DR``, except + if T is not the generic type, the value V pushed is zero-extended to the + generic type bit size and its type changed to the generic type. 4. ``DW_OP_deref_type`` ``DW_OP_deref_type`` has two operands. The first is a 1-byte unsigned - integral constant S. The second is an unsigned LEB128 integer that - represents the offset of a debugging information entry D in the current - compilation unit, that provides the type of the result value. - - It pops one stack entry that must be a location description L. T is the bit - size of the type D. A value V of T bits is retrieved from the location - storage specified by L. V is pushed on the stack with the type D. + integral constant S. The second is an unsigned LEB128 integer DR that + represents the byte offset of a debugging information entry D relative to + the beginning of the current compilation unit, that provides the type T of + the result value. 
- The DWARF is ill-formed if D is not a ``DW_TAG_base_type`` debugging - information entry, if T divided by 8 and rounded up to a multiple of 8 (the - byte size) is not equal to S, if any bit of the value is retrieved from the - undefined location storage, or if the offset of any bit exceeds the size of - the location storage specified by L. + TS is the bit size of the type T. - See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules - concerning implicit location descriptions created by the - ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer`` - operations. - - *While the size of the pushed value V can be inferred from the type D - definition, it is encoded explicitly into the operation so that the - operation can be parsed easily without reference to the* ``.debug_info`` - *section.* + *While the size of the pushed value V can be inferred from the type T, it is + encoded explicitly as the operand S so that the operation can be parsed + easily without reference to the* ``.debug_info`` *section.* .. note:: It is unclear why the operand S is needed. Unlike ``DW_OP_const_type``, the size is not needed for parsing. Any evaluation needs to get the base - type to record with the value to know its encoding and bit size. + type T to push with the value to know its encoding and bit size. + + It pops one stack entry that must be a location description L. + + A value V of TS bits is retrieved from the location storage LS specified by + one of the single location descriptions SL of L. + + *If L, or the location description of any composite location description + part that is a subcomponent of L, has more than one single location + description, then any one of them can be selected as they are required to + all have the same value. For any single location description SL, bits are + retrieved from the associated storage location starting at the bit offset + specified by SL. For a composite location description, the retrieved bits + are the concatenation of the N bits from each composite location part PL, + where N is limited to the size of PL.* + + V is pushed on the stack with the type T. + + .. note:: + + This definition makes it an evaluation error if L is a register location + description that has less than TS bits remaining in the register storage. + Particularly since these extensions extend location descriptions to have + a bit offset, it would be odd to define this as performing sign extension + based on the type, or be target architecture dependent, as the number of + remaining bits could be any number. This matches the GDB implementation + for ``DW_OP_deref_type``. + + These extensions define ``DW_OP_*breg*`` in terms of + ``DW_OP_regval_type``. ``DW_OP_regval_type`` is defined in terms of + ``DW_OP_regx``, which uses a 0 bit offset, and ``DW_OP_deref_type``. + Therefore, it requires the register size to be greater or equal to the + address size of the address space. This matches the GDB implementation for + ``DW_OP_*breg*``. + + The DWARF is ill-formed if D is not in the current compilation unit, D is + not a ``DW_TAG_base_type`` debugging information entry, or if TS divided by + 8 (the byte size) and rounded up to a whole number is not equal to S. + + .. note:: This definition allows the base type to be a bit size since there seems no reason to restrict it. 
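+   For example (a non-normative sketch; the DIE offset 0x3c and register
+   number 2 are purely illustrative, not part of these extensions), if the
+   ``DW_TAG_base_type`` entry for a 32-bit unsigned type were at byte offset
+   0x3c in the current compilation unit, the value of that type stored at the
+   address held in register 2 could be read with::
+
+     DW_OP_bregx 2, 0          # memory location: address in register 2 plus 0
+     DW_OP_deref_type 4, 0x3c  # pop it; push the 4-byte value interpreted
+                               # with the base type DIE at offset 0x3c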
+   It is an evaluation error if any bit of the value is retrieved from the
+   undefined location storage or the offset of any bit exceeds the size of the
+   location storage LS specified by any single location description SL of L.
+
+   See :ref:`amdgpu-dwarf-implicit-location-descriptions` for special rules
+   concerning implicit location descriptions created by the
+   ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_implicit_aspace_pointer``
+   operations.
+
5. ``DW_OP_xderef`` *Deprecated*

   ``DW_OP_xderef`` pops two stack entries. The first must be an integral type
@@ -1277,9 +1564,10 @@ There are these special value operations currently defined:
7. ``DW_OP_xderef_type`` *Deprecated*

   ``DW_OP_xderef_type`` has two operands. The first is a 1-byte unsigned
-   integral constant S. The second operand is an unsigned LEB128
-   integer R that represents the offset of a debugging information entry D in
-   the current compilation unit, that provides the type of the result value.
+   integral constant S. The second operand is an unsigned LEB128 integer DR
+   that represents the byte offset of a debugging information entry D relative
+   to the beginning of the current compilation unit, that provides the type T
+   of the result value.

   It pops two stack entries. The first must be an integral type value that
   represents an address A. The second must be an integral type value that
@@ -1294,51 +1582,43 @@ There are these special value operations currently defined:
8. ``DW_OP_entry_value`` *Deprecated*

-   ``DW_OP_entry_value`` pushes the value that the described location held upon
-   entering the current subprogram.
+   ``DW_OP_entry_value`` pushes the value of an expression that is evaluated in
+   the context of the calling frame.
+
+   *It may be used to determine the value of arguments on entry to the current
+   call frame provided they are not clobbered.*

   It has two operands. The first is an unsigned LEB128 integer S. The second
   is a block of bytes, with a length equal to S, interpreted as a DWARF
   operation expression E.

-   E is evaluated as if it had been evaluated upon entering the current
-   subprogram with an empty initial stack.
+   E is evaluated with the current context, except the result kind is
+   unspecified, the call frame is the one that called the current frame, the
+   program location is the call site in the calling frame, the object is
+   unspecified, and the initial stack is empty. The calling frame information
+   is obtained by virtually unwinding the current call frame using the call
+   frame information (see :ref:`amdgpu-dwarf-call-frame-information`).

-   .. note::
-
-     It is unclear what this means. What is the current program location and
-     current frame that must be used? Does this require reverse execution so
-     the register and memory state are as it was on entry to the current
-     subprogram?
+   If the result of E is a location description L (see
+   :ref:`amdgpu-dwarf-register-location-descriptions`), and the last operation
+   executed by E is a ``DW_OP_reg*`` for register R with a target architecture
+   specific base type of T, then the contents of the register are retrieved as
+   if a ``DW_OP_deref_type DR`` operation was performed where DR is the offset
+   of a hypothetical debug information entry in the current compilation unit
+   for T. The resulting value V is pushed on the stack.

-   The DWARF expression is ill-formed if the evaluation of E executes a
-   ``DW_OP_push_object_address`` operation.
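+   For example (a non-normative sketch; the register number is illustrative),
+   the value that register 5 held on entry to the current subprogram could be
+   described by::
+
+     DW_OP_entry_value 1, DW_OP_reg5   # 1-byte block containing E
+     DW_OP_stack_value                 # when used in a location expression,
+                                       # the result is the value itself,
+                                       # not a location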
+   *Using* ``DW_OP_reg*`` *provides a more compact form for the case where the
+   value was in a register on entry to the subprogram.*

-   If the result of E is a location description with one register location
-   description (see :ref:`amdgpu-dwarf-register-location-descriptions`),
-   ``DW_OP_entry_value`` pushes the value that register had upon entering the
-   current subprogram. The value entry type is the target architecture register
-   base type. If the register value is undefined or the register location
-   description bit offset is not 0, then the DWARF expression is ill-formed.
+   .. note::

-   *The register location description provides a more compact form for the case
-   where the value was in a register on entry to the subprogram.*
+     It is unclear how this provides a more compact expression, as
+     ``DW_OP_regval_type`` could be used which is marginally larger.

-   If the result of E is a value V, ``DW_OP_entry_value`` pushes V on the
-   stack.
+   If the result of E is a value V, then V is pushed on the stack.

   Otherwise, the DWARF expression is ill-formed.

-   *The values needed to evaluate* ``DW_OP_entry_value`` *could be obtained in
-   several ways. The consumer could suspend execution on entry to the
-   subprogram, record values needed by* ``DW_OP_entry_value`` *expressions
-   within the subprogram, and then continue. When evaluating*
-   ``DW_OP_entry_value``\ *, the consumer would use these recorded values
-   rather than the current values. Or, when evaluating* ``DW_OP_entry_value``\
-   *, the consumer could virtually unwind using the Call Frame Information
-   (see* :ref:`amdgpu-dwarf-call-frame-information`\ *) to recover register
-   values that might have been clobbered since the subprogram entry point.*
-
   *The* ``DW_OP_entry_value`` *operation is deprecated as its main usage is
   provided by other means. DWARF Version 5 added the*
   ``DW_TAG_call_site_parameter`` *debugger information entry for call sites
@@ -1351,29 +1631,8 @@ There are these special value operations currently defined:

   .. note::

-     It is unclear why this operation is defined this way. How would a consumer
-     know what values have to be saved on entry to the subprogram? Does it have
-     to parse every expression of every ``DW_OP_entry_value`` operation to
-     capture all the possible results needed? Or does it have to implement
-     reverse execution so it can evaluate the expression in the context of the
-     entry of the subprogram so it can obtain the entry point register and
-     memory values? Or does the compiler somehow instruct the consumer how to
-     create the saved copies of the variables on entry?
-
-     If the expression is simply using existing variables, then it is just a
-     regular expression and no special operation is needed. If the main purpose
-     is only to read the entry value of a register using CFI then it would be
-     better to have an operation that explicitly does just that such as the
-     proposed ``DW_OP_LLVM_call_frame_entry_reg`` operation.
-
-     GDB only seems to implement ``DW_OP_entry_value`` when E is exactly
-     ``DW_OP_reg*`` or ``DW_OP_breg*; DW_OP_deref*``. It evaluates E in the
-     context of the calling subprogram and the calling call site program
-     location. But the wording suggests that is not the intention.
-
-     Given these issues it is suggested ``DW_OP_entry_value`` is deprecated in
-     favor of using the new facities that have well defined semantics and
-     implementations.
+     GDB only implements ``DW_OP_entry_value`` when E is exactly
+     ``DW_OP_reg*`` or ``DW_OP_breg*; DW_OP_deref*``.

..
_amdgpu-dwarf-location-description-operations: @@ -1395,9 +1654,9 @@ General Location Description Operations It adds the value of B scaled by 8 (the byte size) to the bit offset of each single location description SL of L, and pushes the updated L. - If the updated bit offset of any SL is less than 0 or greater than or equal - to the size of the location storage specified by SL, then the DWARF - expression is ill-formed. + It is an evaluation error if the updated bit offset of any SL is less than 0 + or greater than or equal to the size of the location storage specified by + SL. 2. ``DW_OP_LLVM_offset_uconst`` *New* @@ -1426,40 +1685,53 @@ General Location Description Operations It adds the value of B to the bit offset of each single location description SL of L, and pushes the updated L. - If the updated bit offset of any SL is less than 0 or greater than or equal - to the size of the location storage specified by SL, then the DWARF - expression is ill-formed. + It is an evaluation error if the updated bit offset of any SL is less than 0 + or greater than or equal to the size of the location storage specified by + SL. 4. ``DW_OP_push_object_address`` ``DW_OP_push_object_address`` pushes the location description L of the - object currently being evaluated as part of evaluation of a user presented - expression. + current object. - This object may correspond to an independent variable described by its own - debugging information entry or it may be a component of an array, structure, - or class whose address has been dynamically determined by an earlier step - during user expression evaluation. + *This object may correspond to an independent variable that is part of a + user presented expression that is being evaluated. The object location + description may be determined from the variable's own debugging information + entry or it may be a component of an array, structure, or class whose + address has been dynamically determined by an earlier step during user + expression evaluation.* *This operation provides explicit functionality (especially for arrays involving descriptions) that is analogous to the implicit push of the base location description of a structure prior to evaluation of a ``DW_AT_data_member_location`` to access a data member of a structure.* + .. note:: + + This operation could be removed and the object location description + specified as the initial stack as for ``DW_AT_data_member_location``. + + The only attribute that specifies a current object is + ``DW_AT_data_location`` so the non-normative text seems to overstate how + this is being used. Or are there other attributes that need to state they + pass an object? + 5. ``DW_OP_LLVM_call_frame_entry_reg`` *New* ``DW_OP_LLVM_call_frame_entry_reg`` has a single unsigned LEB128 integer operand that represents a target architecture register number R. It pushes a location description L that holds the value of register R on - entry to the current subprogram as defined by the Call Frame Information + entry to the current subprogram as defined by the call frame information (see :ref:`amdgpu-dwarf-call-frame-information`). - *If there is no Call Frame Information defined, then the default rules for + *If there is no call frame information defined, then the default rules for the target architecture are used. If the register rule is* undefined\ *, then the undefined location description is pushed. If the register rule is* same value\ *, then a register location description for R is pushed.* +.. 
_amdgpu-dwarf-undefined-location-description-operations: + Undefined Location Description Operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1616,8 +1888,19 @@ type. least significant S bits as a twos-complement unsigned value A'. It pushes a location description L with one memory location description SL - on the stack. SL specifies the memory location storage that corresponds to - AS with a bit offset equal to A' scaled by 8 (the byte size). + on the stack. SL specifies the memory location storage LS that corresponds + to AS with a bit offset equal to A' scaled by 8 (the byte size). + + If AS is an address space that is specific to context elements, then LS + corresponds to the location storage associated with the current context. + + *For example, if AS is for per thread storage then LS is the location + storage for the current thread. For languages that are implemented using a + SIMD or SIMT execution model, then if AS is for per lane storage then LS is + the location storage for the current lane of the current thread. Therefore, + if L is accessed by an operation, the location storage selected when the + location description was created is accessed, and not the location storage + associated with the current context of the access operation.* The DWARF expression is ill-formed if AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values. @@ -1630,16 +1913,17 @@ type. 4. ``DW_OP_form_tls_address`` ``DW_OP_form_tls_address`` pops one stack entry that must be an integral - type value and treats it as a thread-local storage address T. + type value and treats it as a thread-local storage address TA. It pushes a location description L with one memory location description SL on the stack. SL is the target architecture specific memory location - description that corresponds to the thread-local storage address T. + description that corresponds to the thread-local storage address TA. - The meaning of the thread-local storage address T is defined by the run-time - environment. If the run-time environment supports multiple thread-local - storage blocks for a single thread, then the block corresponding to the - executable or shared library containing this DWARF expression is used. + The meaning of the thread-local storage address TA is defined by the + run-time environment. If the run-time environment supports multiple + thread-local storage blocks for a single thread, then the block + corresponding to the executable or shared library containing this DWARF + expression is used. *Some implementations of C, C++, Fortran, and other languages support a thread-local storage class. Variables with this storage class have distinct @@ -1661,15 +1945,15 @@ type. 5. ``DW_OP_call_frame_cfa`` ``DW_OP_call_frame_cfa`` pushes the location description L of the Canonical - Frame Address (CFA) of the current subprogram, obtained from the Call Frame - Information on the stack. See :ref:`amdgpu-dwarf-call-frame-information`. + Frame Address (CFA) of the current subprogram, obtained from the call frame + information on the stack. See :ref:`amdgpu-dwarf-call-frame-information`. *Although the value of the* ``DW_AT_frame_base`` *attribute of the debugger information entry corresponding to the current subprogram can be computed using a location list expression, in some cases this would require an extensive location list because the values of the registers used in - computing the CFA change during a subprogram execution. 
If the Call Frame - Information is present, then it already encodes such changes, and it is + computing the CFA change during a subprogram execution. If the call frame + information is present, then it already encodes such changes, and it is space efficient to reference that using the* ``DW_OP_call_frame_cfa`` *operation.* @@ -1701,10 +1985,12 @@ type. The address size S is defined as the address bit size of the target architecture specific address space corresponding to AS. - The contents of the register specified by R are retrieved as a - twos-complement unsigned value and zero extended to S bits. B is added and - the least significant S bits are treated as a twos-complement unsigned value - to be used as an address A. + The contents of the register specified by R are retrieved as if a + ``DW_OP_regval_type R, DR`` operation was performed where DR is the offset + of a hypothetical debug information entry in the current compilation unit + for an unsigned integral base type of size S bits. B is added and the least + significant S bits are treated as an unsigned value to be used as an address + A. They push a location description L comprising one memory location description LS on the stack. LS specifies the memory location storage that @@ -1716,8 +2002,8 @@ type. that represents a register number R. The second is a signed LEB128 integer that represents a byte displacement B. - The action is the same as for ``DW_OP_breg`` except that R is used as the - register number and B is used as the byte displacement. + The action is the same as for ``DW_OP_breg``, except that R is used as + the register number and B is used as the byte displacement. 9. ``DW_OP_LLVM_aspace_bregx`` *New* @@ -1727,9 +2013,9 @@ type. entry that is required to be an integral type value that represents a target architecture specific address space identifier AS. - The action is the same as for ``DW_OP_breg`` except that R is used as the - register number, B is used as the byte displacement, and AS is used as the - address space identifier. + The action is the same as for ``DW_OP_breg``, except that R is used as + the register number, B is used as the byte displacement, and AS is used as + the address space identifier. The DWARF expression is ill-formed if AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values. @@ -1759,18 +2045,32 @@ register starting at the specified bit offset. numbered from 0 through 31, inclusive. The target architecture register number R corresponds to the N in the operation name. - They push a location description L that specifies one register location - description SL on the stack. SL specifies the register location storage that - corresponds to R with a bit offset of 0. + The operation is equivalent to performing ``DW_OP_regx R``. 2. ``DW_OP_regx`` ``DW_OP_regx`` has a single unsigned LEB128 integer operand that represents a target architecture register number R. - It pushes a location description L that specifies one register location - description SL on the stack. SL specifies the register location storage that - corresponds to R with a bit offset of 0. + If the current call frame is the top call frame, it pushes a location + description L that specifies one register location description SL on the + stack. SL specifies the register location storage that corresponds to R with + a bit offset of 0 for the current thread. 
+
+   If the current call frame is not the top call frame, call frame information
+   (see :ref:`amdgpu-dwarf-call-frame-information`) is used to determine the
+   location description that holds the register for the current call frame and
+   current program location of the current thread. The resulting location
+   description L is pushed.
+
+   *Note that if call frame information is used, the resulting location
+   description may be register, memory, or undefined.*
+
+   *An implementation may evaluate the call frame information immediately, or
+   may defer evaluation until L is accessed by an operation. If evaluation is
+   deferred, R and the current context can be recorded in L. When accessed, the
+   recorded context is used to evaluate the call frame information, not the
+   current context of the access operation.*

   *These operations obtain a register location. To fetch the contents of a
   register, it is necessary to use* ``DW_OP_regval_type``\ *, use one of the*
@@ -1844,16 +2144,14 @@ implicit storage value starting at the bit offset.
   to access the dereferenced pointer, even when it cannot access the pointer
   itself.*

-   ``DW_OP_implicit_pointer`` has two operands. The first is a 4-byte unsigned
-   value in the 32-bit DWARF format, or an 8-byte unsigned value in the 64-bit
-   DWARF format, that represents a debugging information entry reference R. The
-   second is a signed LEB128 integer that represents a byte displacement B.
+   ``DW_OP_implicit_pointer`` has two operands. The first operand is a 4-byte
+   unsigned value in the 32-bit DWARF format, or an 8-byte unsigned value in
+   the 64-bit DWARF format, that represents the byte offset DR of a debugging
+   information entry D relative to the beginning of the ``.debug_info`` section
+   that contains the current compilation unit. The second operand is a signed
+   LEB128 integer that represents a byte displacement B.

-   R is used as the offset of a debugging information entry D in a
-   ``.debug_info`` section, which may be contained in an executable or shared
-   object file other than that containing the operation. For references from one
-   executable or shared object file to another, the relocation must be
-   performed by the consumer.
+   *Note that D may not be in the current compilation unit.*

   *The first operand interpretation is exactly like that for*
   ``DW_FORM_ref_addr``\ *.*
@@ -1870,8 +2168,10 @@ implicit storage value starting at the bit offset.
   It pushes a location description L that comprises one implicit location
   description SL on the stack. SL specifies LS with a bit offset of 0.

-   If a ``DW_OP_deref*`` operation pops a location description L', and
-   retrieves S bits where both:
+   It is an evaluation error if a ``DW_OP_deref*`` operation pops a location
+   description L', and retrieves S bits, such that any retrieved bits come from
+   an implicit location storage that is the same as LS, unless both the
+   following conditions are met:

   1. All retrieved bits come from an implicit location description that
      refers to an implicit location storage that is the same as LS.
@@ -1884,26 +2184,29 @@ implicit storage value starting at the bit offset.

   *These rules are equivalent to retrieving the complete contents of LS.*

-   Then the value V pushed by the ``DW_OP_deref*`` operation is an implicit
-   pointer value IPV with a target architecture specific address space of AS, a
-   debugging information entry of D, and a base type of T. If AS is the target
-   architecture default address space, then T is the generic type.
Otherwise, T - is a target architecture specific integral type with a bit size equal to S. - - Otherwise, if a ``DW_OP_deref*`` operation is applied to a location - description such that some retrieved bits come from an implicit location - storage that is the same as LS, then the DWARF expression is ill-formed. + If both the above conditions are met, then the value V pushed by the + ``DW_OP_deref*`` operation is an implicit pointer value IPV with a target + architecture specific address space of AS, a debugging information entry of + D, and a base type of T. If AS is the target architecture default address + space, then T is the generic type. Otherwise, T is a target architecture + specific integral type with a bit size equal to S. If IPV is either implicitly converted to a location description (only done if AS is the target architecture default address space) or used by - ``DW_OP_LLVM_form_aspace_address`` (only done if the address space specified - is AS), then the resulting location description RL is: + ``DW_OP_LLVM_form_aspace_address`` (only done if the address space popped by + ``DW_OP_LLVM_form_aspace_address`` is AS), then the resulting location + description RL is: * If D has a ``DW_AT_location`` attribute, the DWARF expression E from the - ``DW_AT_location`` attribute is evaluated as a location description. The - current subprogram and current program location of the evaluation context - that is accessing IPV is used for the evaluation context of E, together - with an empty initial stack. RL is the expression result. + ``DW_AT_location`` attribute is evaluated with the current context, except + that the result kind is a location description, the compilation unit is + the one that contains D, the object is unspecified, and the initial stack + is empty. RL is the expression result. + + *Note that E is evaluated with the context of the expression accessing + IPV, and not the context of the expression that contained the* + ``DW_OP_implicit_pointer`` *or* ``DW_OP_LLVM_aspace_implicit_pointer`` + *operation that created L.* * If D has a ``DW_AT_const_value`` attribute, then an implicit location storage RLS is created from the ``DW_AT_const_value`` attribute's value @@ -1917,7 +2220,7 @@ implicit storage value starting at the bit offset. deprecated and instead ``DW_AT_location`` is used with an implicit location description, then this rule would not be required. - * Otherwise the DWARF expression is ill-formed. + * Otherwise, it is an evaluation error. The bit offset of RL is updated as if the ``DW_OP_LLVM_offset_uconst B`` operation was applied. @@ -1925,8 +2228,7 @@ implicit storage value starting at the bit offset. If a ``DW_OP_stack_value`` operation pops a value that is the same as IPV, then it pushes a location description that is the same as L. - The DWARF expression is ill-formed if it accesses LS or IPV in any other - manner. + It is an evaluation error if LS or IPV is accessed in any other manner. *The restrictions on how an implicit pointer location description created by* ``DW_OP_implicit_pointer`` *and* ``DW_OP_LLVM_aspace_implicit_pointer`` @@ -1942,12 +2244,18 @@ implicit storage value starting at the bit offset. a target architecture specific address space identifier AS. The location description L that is pushed on the stack is the same as for - ``DW_OP_implicit_pointer`` except that the address space identifier used is + ``DW_OP_implicit_pointer``, except that the address space identifier used is AS. 
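+   For example (a non-normative sketch; the DIE offset, displacement, and
+   address space identifier are illustrative), an optimized-away pointer known
+   to point at the variable described by the debugging information entry at
+   ``.debug_info`` byte offset 0x54 could be described by::
+
+     DW_OP_constu 0x1                            # address space identifier AS
+     DW_OP_LLVM_aspace_implicit_pointer 0x54, 0  # implicit pointer to D plus
+                                                 # a byte displacement of 0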
The DWARF expression is ill-formed if AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values. + .. note:: + + This definition of ``DW_OP_LLVM_aspace_implicit_pointer`` may change when + full support for address classes is added as required for languages such + as OpenCL/SyCL. + *Typically a* ``DW_OP_implicit_pointer`` *or* ``DW_OP_LLVM_aspace_implicit_pointer`` *operation is used in a DWARF expression E*\ :sub:`1` *of a* ``DW_TAG_variable`` *or* ``DW_TAG_formal_parameter`` @@ -2096,7 +2404,7 @@ compatible with the definitions in DWARF Version 5.* .. note:: - Since this proposal allows location descriptions to be entries on the + Since these extensions allow location descriptions to be entries on the stack, a simpler operation to create composite location descriptions. For example, just one operation that specifies how many parts, and pops pairs of stack entries for the part size and location description. Not only @@ -2111,7 +2419,7 @@ compatible with the definitions in DWARF Version 5.* integer that represents the part bit size S. The second is an unsigned LEB128 integer that represents a bit displacement B. - The action is the same as for ``DW_OP_piece`` except that any part created + The action is the same as for ``DW_OP_piece``, except that any part created has the bit size S, and the location description PL of any created part is updated as if the ``DW_OP_constu B; DW_OP_LLVM_bit_offset`` operations were applied. @@ -2252,17 +2560,31 @@ defined, and there is no following default location description entry, it is assumed that the object is not available for the portion of the range that is not covered. -The operation expression of each matching location list entry is evaluated as a -location description and its result is returned as the result of the location -list entry. The operation expression is evaluated with the same context as the -location list expression, including the same current frame, current program -location, and initial stack. +The result of the evaluation of a DWARF location list expression is: + +* If the current program location is not specified, then it is an evaluation + error. + + .. note:: + + If the location list only has a single default entry, should that be + considered a match if there is no program location? If there are non-default + entries then it seems it has to be an evaluation error when there is no + program location as that indicates the location depends on the program + location which is not known. -The result of the evaluation of a DWARF location list expression is a location -description that is comprised of the union of the single location descriptions -of the location description result of each matching location list entry. If -there are no matching location list entries, then the result is a location -description that comprises one undefined location description. +* If there are no matching location list entries, then the result is a location + description that comprises one undefined location description. + +* Otherwise, the operation expression E of each matching location list entry is + evaluated with the current context, except that the result kind is a location + description, the object is unspecified, and the initial stack is empty. The + location list entry result is the location description returned by the + evaluation of E. 
+ + The result is a location description that is comprised of the union of the + single location descriptions of the location description result of each + matching location list entry. A location list expression can only be used as the value of a debugger information entry attribute that is encoded using class ``loclist`` or @@ -2365,15 +2687,15 @@ DWARF address space identifiers are used by: .. note:: - With the definition of DWARF address classes and DWARF address spaces in this - proposal, DWARF Version 5 table 2.7 needs to be updated. It seems it is an + With the definition of DWARF address classes and DWARF address spaces in these + extensions, DWARF Version 5 table 2.7 needs to be updated. It seems it is an example of DWARF address spaces and not DWARF address classes. .. note:: - With the expanded support for DWARF address spaces in this proposal, it may be - worth examining if DWARF segments can be eliminated and DWARF address spaces - used instead. + With the expanded support for DWARF address spaces in these extensions, it may + be worth examining if DWARF segments can be eliminated and DWARF address + spaces used instead. That may involve extending DWARF address spaces to also be used to specify code locations. In target architectures that use different memory areas for @@ -2423,7 +2745,7 @@ DWARF address space identifiers are used by: debugger information entry type modifier that can be applied to a pointer type and reference type. The ``DW_AT_address_class`` attribute could be re-defined to not be target architecture specific and instead define generalized language - values (as is proposed above for DWARF address classes in the table + values (as presented above for DWARF address classes in the table :ref:`amdgpu-dwarf-address-class-table`) that will support OpenCL and other languages using memory spaces. The ``DW_AT_address_class`` attribute could be defined to not be applied to pointer types or reference types, but instead @@ -2444,7 +2766,7 @@ DWARF address space identifiers are used by: variable allocation in address classes. Such variable allocation would result in the variable's location description needing an address space. - The approach proposed in :ref:`amdgpu-dwarf-address-class-table` is to define + The approach presented in :ref:`amdgpu-dwarf-address-class-table` is to define the default ``DW_ADDR_none`` to be the generic address class and not the global address class. This matches how CLANG and LLVM have added support for CUDA-like languages on top of existing C++ language support. This allows all @@ -2471,6 +2793,26 @@ DWARF address space identifiers are used by: ``DW_AT_address_space`` would be target architecture specific and the same as used in ``DW_OP_xderef*``. +.. note:: + + Some additional changes will be made to support languages such as OpenCL/SyCL + that allow address class pointer casting and queries. + + This requires the compiler to provide the mapping from address space to + address class which may be runtime and not target architecture dependent. Some + implementations may have a one-to-one mapping from source language address + class to target architecture address space, and some may have a many-to-one + mapping which requires knowledge of the address class when determining if + pointer address class casts are allowed. + + The changes will likely add an attribute that has an expression provided by + the compiler to map from address class to address space. 
The + ``DW_OP_implicit_pointer`` and ``DW_OP_LLVM_aspace_implicit_pointer`` + operations may be changed as the current IPV definition may not provide enough + information when used to cast between address classes. Other attributes and + operations may be needed. The legal casts between address classes may need to + be defined on a per language address class basis. + .. _amdgpu-dwarf-debugging-information-entry-attributes: Debugging Information Entry Attributes @@ -2479,7 +2821,7 @@ Debugging Information Entry Attributes .. note:: This section provides changes to existing debugger information entry - attributes and defines attributes added by the proposal. These would be + attributes and defines attributes added by these extensions. These would be incorporated into the appropriate DWARF Version 5 chapter 2 sections. 1. ``DW_AT_location`` @@ -2488,10 +2830,12 @@ Debugging Information Entry Attributes variables and parameters) or common blocks may have a ``DW_AT_location`` attribute, whose value is a DWARF expression E. - The result of the attribute is obtained by evaluating E as a location - description in the context of the current subprogram, current program - location, and with an empty initial stack. See - :ref:`amdgpu-dwarf-expressions`. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The result of the evaluation is the location + description of the base of the data object. See :ref:`amdgpu-dwarf-control-flow-operations` for special evaluation rules used by the ``DW_OP_call*`` operations. @@ -2535,9 +2879,11 @@ Debugging Information Entry Attributes may have a ``DW_AT_frame_base`` attribute, whose value is a DWARF expression E. - The result of the attribute is obtained by evaluating E as a location - description in the context of the current subprogram, current program - location, and with an empty initial stack. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The DWARF is ill-formed if E contains an ``DW_OP_fbreg`` operation, or the resulting location description L is not comprised of one single location @@ -2584,10 +2930,13 @@ Debugging Information Entry Attributes byte.* 2. Otherwise, the attribute must be a DWARF expression E which is evaluated - with a context of the current frame, current program location, and an - initial stack comprising the location description of the beginning of - the containing entity. The result of the evaluation is the location - description of the base of the member entry. + with a context that has a result kind of a location description, an + unspecified object, the compilation unit that contains E, an initial + stack comprising the location description of the beginning of the + containing entity, and other context elements corresponding to the + source language thread of execution upon which the user is focused, if + any. The result of the evaluation is the location description of the + base of the member entry. .. 
note:: @@ -2613,12 +2962,16 @@ Debugging Information Entry Attributes location description for a particular object of the given pointer to member type and for a particular structure or class instance. - The result of the attribute is obtained by evaluating E as a location - description with the context of the current subprogram, current program - location, and an initial stack comprising two entries. The first entry is - the value of the pointer to member object itself. The second entry is the - location description of the base of the entire class, structure, or union - instance containing the member whose location is being calculated. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an initial stack comprising two entries, + and other context elements corresponding to the source language thread of + execution upon which the user is focused, if any. The first stack entry is + the value of the pointer to member object itself. The second stack entry is + the location description of the base of the entire class, structure, or + union instance containing the member whose location is being calculated. The + result of the evaluation is the location description of the member of the + class to which the pointer to member entry points. 6. ``DW_AT_data_location`` @@ -2629,9 +2982,13 @@ Debugging Information Entry Attributes attribute is omitted, the location description of the data is the same as the location description of the object. - The result of the attribute is obtained by evaluating E as a location - description with the context of the current subprogram, current program - location, and an empty initial stack. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an object that is the location + description of the data descriptor, the compilation unit that contains E, an + empty initial stack, and other context elements corresponding to the source + language thread of execution upon which the user is focused, if any. The + result of the evaluation is the location description of the base of the + member entry. *E will typically involve an operation expression that begins with a* ``DW_OP_push_object_address`` *operation which loads the location @@ -2657,13 +3014,14 @@ Debugging Information Entry Attributes An entry for a virtual function also has a ``DW_AT_vtable_elem_location`` attribute whose value is a DWARF expression E. - The result of the attribute is obtained by evaluating E as a location - description with the context of the current subprogram, current program - location, and an initial stack comprising the location description of the - object of the enclosing type. - - The resulting location description is the slot for the function within the - virtual function table for the enclosing class. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an initial stack comprising the location + description of the object of the enclosing type, and other context elements + corresponding to the source language thread of execution upon which the user + is focused, if any. The result of the evaluation is the location description + of the slot for the function within the virtual function table for the + enclosing class. 8. 
``DW_AT_static_link``

@@ -2671,17 +3029,18 @@ Debugging Information Entry Attributes
   entry is lexically nested, it may have a ``DW_AT_static_link`` attribute,
   whose value is a DWARF expression E.

-   The result of the attribute is obtained by evaluating E as a location
-   description with the context of the current subprogram, current program
-   location, and an empty initial stack.
+   The result of the attribute is obtained by evaluating E with a context that
+   has a result kind of a location description, an unspecified object, the
+   compilation unit that contains E, an empty initial stack, and other context
+   elements corresponding to the source language thread of execution upon which
+   the user is focused, if any. The result of the evaluation is the location
+   description L of the *canonical frame address* (see
+   :ref:`amdgpu-dwarf-call-frame-information`) of the relevant call frame of
+   the subprogram instance that immediately lexically encloses the current call
+   frame's subprogram or entry point.

-   The DWARF is ill-formed if the resulting location description L is is not
-   comprised of one memory location description in any of the target
-   architecture specific address spaces.
-
-   The resulting L is the *frame base* of the relevant instance of the
-   subprogram that immediately lexically encloses the subprogram or entry
-   point.
+   The DWARF is ill-formed if L is not comprised of one memory location
+   description for one of the target architecture specific address spaces.

9. ``DW_AT_return_addr``

   A ``DW_TAG_subprogram``, ``DW_TAG_inlined_subroutine``, or
   ``DW_TAG_entry_point`` debugger information entry may have a
   ``DW_AT_return_addr`` attribute, whose value is a DWARF expression E.

-   The result of the attribute is obtained by evaluating E as a location
-   description with the context of the current subprogram, current program
-   location, and an empty initial stack.
-
-   The DWARF is ill-formed if the resulting location description L is not
-   comprised one memory location description in any of the target architecture
-   specific address spaces.
+   The result of the attribute is obtained by evaluating E with a context that
+   has a result kind of a location description, an unspecified object, the
+   compilation unit that contains E, an empty initial stack, and other context
+   elements corresponding to the source language thread of execution upon which
+   the user is focused, if any. The result of the evaluation is the location
+   description L of the place where the return address for the current call
+   frame's subprogram or entry point is stored.

-   The resulting L is the place where the return address for the subprogram or
-   entry point is stored.
+   The DWARF is ill-formed if L is not comprised of one memory location
+   description for one of the target architecture specific address spaces.

   .. note::

     It is unclear why ``DW_TAG_inlined_subroutine`` has a
     ``DW_AT_return_addr`` attribute but not a ``DW_AT_frame_base`` or
     ``DW_AT_static_link`` attribute. Seems it would either have all of them or
-     none. Since inlined subprograms do not have a frame it seems they would
-     have none of these attributes.
+     none. Since inlined subprograms do not have a call frame it seems they
+     would have none of these attributes.

-10. ``DW_AT_call_value``, ``DW_AT_call_data_location``, and ``DW_AT_call_data_value``
+10.
``DW_AT_call_value``, ``DW_AT_call_data_location``, and + ``DW_AT_call_data_value`` A ``DW_TAG_call_site_parameter`` debugger information entry may have a ``DW_AT_call_value`` attribute, whose value is a DWARF operation expression E\ :sub:`1`\ . The result of the ``DW_AT_call_value`` attribute is obtained by evaluating - E\ :sub:`1` as a value with the context of the call site subprogram, call - site program location, and an empty initial stack. - - The call site subprogram is the subprogram containing the - ``DW_TAG_call_site_parameter`` debugger information entry. The call site - program location is the location of call site in the call site subprogram. - - *The consumer may have to virtually unwind to the call site in order to - evaluate the attribute. This will provide both the call site subprogram and - call site program location needed to evaluate the expression.* - - The resulting value V\ :sub:`1` is the value of the parameter at the time of - the call made by the call site. + E\ :sub:`1` with a context that has a result kind of a value, an unspecified + object, the compilation unit that contains E, an empty initial stack, and + other context elements corresponding to the source language thread of + execution upon which the user is focused, if any. The resulting value V\ + :sub:`1` is the value of the parameter at the time of the call made by the + call site. For parameters passed by reference, where the code passes a pointer to a location which contains the parameter, or for reference type parameters, the @@ -2737,21 +3090,34 @@ Debugging Information Entry Attributes value is a DWARF operation expression E\ :sub:`3`\ . The value of the ``DW_AT_call_data_location`` attribute is obtained by - evaluating E\ :sub:`2` as a location description with the context of the - call site subprogram, call site program location, and an empty initial - stack. - - The resulting location description L\ :sub:`2` is the location where the + evaluating E\ :sub:`2` with a context that has a result kind of a location + description, an unspecified object, the compilation unit that contains E, an + empty initial stack, and other context elements corresponding to the source + language thread of execution upon which the user is focused, if any. The + resulting location description L\ :sub:`2` is the location where the referenced parameter lives during the call made by the call site. If E\ :sub:`2` would just be a ``DW_OP_push_object_address``, then the ``DW_AT_call_data_location`` attribute may be omitted. The value of the ``DW_AT_call_data_value`` attribute is obtained by - evaluating E\ :sub:`3` as a value with the context of the call site - subprogram, call site program location, and an empty initial stack. - - The resulting value V\ :sub:`3` is the value in L\ :sub:`2` at the time of - the call made by the call site. + evaluating E\ :sub:`3` with a context that has a result kind of a value, an + unspecified object, the compilation unit that contains E, an empty initial + stack, and other context elements corresponding to the source language + thread of execution upon which the user is focused, if any. The resulting + value V\ :sub:`3` is the value in L\ :sub:`2` at the time of the call made + by the call site. 
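+   For example (a non-normative sketch; the register numbers are
+   illustrative), a parameter passed in register 1 whose value at the call
+   site was the contents of the caller's register 4 might be described by::
+
+     DW_TAG_call_site_parameter
+       DW_AT_location   (DW_OP_reg1)    # where the parameter is passed
+       DW_AT_call_value (DW_OP_breg4 0) # its value at the time of the call,
+                                        # evaluated in the calling frame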
+ + The result of these attributes is undefined if the current call frame is + not for the subprogram containing the ``DW_TAG_call_site_parameter`` + debugger information entry or the current program location is not for the + call site containing the ``DW_TAG_call_site_parameter`` debugger information + entry in the current call frame. + + *The consumer may have to virtually unwind to the call site (see* + :ref:`amdgpu-dwarf-call-frame-information`\ *) in order to evaluate these + attributes. This will ensure the source language thread of execution upon + which the user is focused corresponds to the call site needed to evaluate + the expression.* If it is not possible to avoid the expressions of these attributes from accessing registers or memory locations that might be clobbered by the @@ -2787,9 +3153,11 @@ Debugging Information Entry Attributes ``DW_TAG_entry_point`` debugging information entry may have a ``DW_AT_LLVM_lane_pc`` attribute whose value is a DWARF expression E. - The result of the attribute is obtained by evaluating E as a location - description with the context of the current subprogram, current program - location, and an empty initial stack. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a location description, an unspecified object, the + compilation unit that contains E, an empty initial stack, and other context + elements corresponding to the source language thread of execution upon which + the user is focused, if any. The resulting location description L is for a thread lane count sized vector of generic type elements. The thread lane count is the value of the @@ -2817,9 +3185,11 @@ Debugging Information Entry Attributes ``DW_TAG_entry_point`` debugger information entry may have a ``DW_AT_LLVM_active_lane`` attribute whose value is a DWARF expression E. - The result of the attribute is obtained by evaluating E as a value with the - context of the current subprogram, current program location, and an empty - initial stack. + The result of the attribute is obtained by evaluating E with a context that + has a result kind of a value, an unspecified object, the compilation unit + that contains E, an empty initial stack, and other context elements + corresponding to the source language thread of execution upon which the user + is focused, if any. The DWARF is ill-formed if the resulting value V is not an integral value. @@ -3042,14 +3412,16 @@ Call Frame Information .. note:: - This section provides changes to existing Call Frame Information and defines - instructions added by the proposal. Additional support is added for address - spaces. Register unwind DWARF expressions are generalized to allow any + This section provides changes to existing call frame information and defines + instructions added by these extensions. Additional support is added for + address spaces. Register unwind DWARF expressions are generalized to allow any location description, including those with composite and implicit location descriptions. These changes would be incorporated into the DWARF Version 5 section 6.1. +.. _amdgpu-dwarf-structure_of-call-frame-information: + Structure of Call Frame Information +++++++++++++++++++++++++++++++++++ @@ -3057,24 +3429,43 @@ The register rules are: *undefined* A register that has this rule has no recoverable value in the previous frame. - (By convention, it is not preserved by a callee.) 
+ The previous value of this register is the undefined location description (see + :ref:`amdgpu-dwarf-undefined-location-description-operations`). + + *By convention, the register is not preserved by a callee.* *same value* - This register has not been modified from the previous frame. (By convention, - it is preserved by the callee, but the callee has not modified it.) + This register has not been modified from the previous caller frame. + + If the current frame is the top frame, then the previous value of this + register is the location description L that specifies one register location + description SL. SL specifies the register location storage that corresponds to + the register with a bit offset of 0 for the current thread. + + If the current frame is not the top frame, then the previous value of this + register is the location description obtained using the call frame information + for the callee frame and callee program location invoked by the current caller + frame for the same register. + + *By convention, the register is preserved by the callee, but the callee has + not modified it.* *offset(N)* N is a signed byte offset. The previous value of this register is saved at the location description computed as if the DWARF operation expression - ``DW_OP_LLVM_offset N`` is evaluated as a location description with an initial - stack comprising the location description of the current CFA (see + ``DW_OP_LLVM_offset N`` is evaluated with the current context, except the + result kind is a location description, the compilation unit is unspecified, + the object is unspecified, and an initial stack comprising the location + description of the current CFA (see :ref:`amdgpu-dwarf-operation-expressions`). *val_offset(N)* N is a signed byte offset. The previous value of this register is the memory byte address of the location description computed as if the DWARF operation - expression ``DW_OP_LLVM_offset N`` is evaluated as a location description with - an initial stack comprising the location description of the current CFA (see + expression ``DW_OP_LLVM_offset N`` is evaluated with the current context, + except the result kind is a location description, the compilation unit is + unspecified, the object is unspecified, and an initial stack comprising the + location description of the current CFA (see :ref:`amdgpu-dwarf-operation-expressions`). The DWARF is ill-formed if the CFA location description is not a memory byte @@ -3103,26 +3494,40 @@ The register rules are: test). *register(R)* - The previous value of this register is stored in another register numbered R. + This register has been stored in another register numbered R. - The DWARF is ill-formed if the register sizes do not match. + The previous value of this register is the location description obtained using + the call frame information for the current frame and current program location + for register R. + + The DWARF is ill-formed if the size of this register does not match the size + of register R or if there is a cyclic dependency in the call frame + information. + + .. note:: + + Should this also allow R to be larger than this register? If so is the value + stored in the low order bits and it is undefined what is stored in the + extra upper bits? *expression(E)* The previous value of this register is located at the location description produced by evaluating the DWARF operation expression E (see :ref:`amdgpu-dwarf-operation-expressions`). 
- E is evaluated as a location description in the context of the current - subprogram, current program location, and with an initial stack comprising the - location description of the current CFA. + E is evaluated with the current context, except the result kind is a location + description, the compilation unit is unspecified, the object is unspecified, + and an initial stack comprising the location description of the current CFA + (see :ref:`amdgpu-dwarf-operation-expressions`). *val_expression(E)* The previous value of this register is the value produced by evaluating the DWARF operation expression E (see :ref:`amdgpu-dwarf-operation-expressions`). - E is evaluated as a value in the context of the current subprogram, current - program location, and with an initial stack comprising the location - description of the current CFA. + E is evaluated with the current context, except the result kind is a value, + the compilation unit is unspecified, the object is unspecified, and an initial + stack comprising the location description of the current CFA (see + :ref:`amdgpu-dwarf-operation-expressions`). The DWARF is ill-formed if the resulting value type size does not match the register size. @@ -3139,8 +3544,8 @@ The register rules are: *architectural* The rule is defined externally to this specification by the augmenter. -A Common Information Entry holds information that is shared among many Frame -Description Entries. There is at least one CIE in every non-empty +A Common Information Entry (CIE) holds information that is shared among many +Frame Description Entries (FDE). There is at least one CIE in every non-empty ``.debug_frame`` section. A CIE contains the following fields, in order: 1. ``length`` (initial length) @@ -3167,7 +3572,7 @@ Description Entries. There is at least one CIE in every non-empty .. note:: - Would this be increased to 5 to reflect the changes in the proposal? + Would this be increased to 5 to reflect the changes in these extensions? 4. ``augmentation`` (sequence of UTF-8 characters) @@ -3236,6 +3641,10 @@ Description Entries. There is at least one CIE in every non-empty represents the return address of the subprogram. Note that this column might not correspond to an actual machine register. + The value of the return address register is used to determine the program + location of the caller frame. The program location of the top frame is the + target architecture program counter value of the current thread. + 10. ``initial_instructions`` (array of ubyte) A sequence of rules that are interpreted to create the initial setting of @@ -3356,7 +3765,7 @@ CFA Definition Instructions result of evaluating the DWARF operation expression ``DW_OP_constu AS; DW_OP_aspace_bregx R, B*data_alignment_factor`` as a location description. - *The action is the same as* ``DW_CFA_def_cfa`` *except that the second + *The action is the same as* ``DW_CFA_def_cfa``\ *, except that the second operand is signed and factored.* 3. ``DW_CFA_def_aspace_cfa`` *New* @@ -3384,7 +3793,7 @@ CFA Definition Instructions If AS is not one of the values defined by the target architecture specific ``DW_ASPACE_*`` values, then the DWARF expression is ill-formed. - *The action is the same as* ``DW_CFA_aspace_def_cfa`` *except that the + *The action is the same as* ``DW_CFA_aspace_def_cfa``\ *, except that the second operand is signed and factored.* 5. 
``DW_CFA_def_cfa_register`` @@ -3423,7 +3832,7 @@ CFA Definition Instructions If the subprogram has no current CFA rule, or the rule was defined by a ``DW_CFA_def_cfa_expression`` instruction, then the DWARF is ill-formed. - *The action is the same as* ``DW_CFA_def_cfa_offset`` *except that the + *The action is the same as* ``DW_CFA_def_cfa_offset``\ *, except that the operand is signed and factored.* 8. ``DW_CFA_def_cfa_expression`` @@ -3431,8 +3840,9 @@ CFA Definition Instructions The ``DW_CFA_def_cfa_expression`` instruction takes a single operand encoded as a ``DW_FORM_exprloc`` value representing a DWARF operation expression E. The required action is to define the current CFA rule to be the result of - evaluating E as a location description in the context of the current - subprogram, current program location, and an empty initial stack. + evaluating E with the current context, except the result kind is a location + description, the compilation unit is unspecified, the object is unspecified, + and an empty initial stack. *See* :ref:`amdgpu-dwarf-call-frame-instructions` *regarding restrictions on the DWARF expression operations that can be used in E.* @@ -3473,7 +3883,7 @@ Register Rule Instructions The ``DW_CFA_offset_extended`` instruction takes two unsigned LEB128 operands representing a register number R and a factored displacement B. - This instruction is identical to ``DW_CFA_offset`` except for the encoding + This instruction is identical to ``DW_CFA_offset``, except for the encoding and size of the register operand. .. note:: @@ -3486,7 +3896,7 @@ Register Rule Instructions The ``DW_CFA_offset_extended_sf`` instruction takes two operands: an unsigned LEB128 value representing a register number R and a signed LEB128 factored displacement B. This instruction is identical to - ``DW_CFA_offset_extended`` except that B is signed. + ``DW_CFA_offset_extended``, except that B is signed. 6. ``DW_CFA_val_offset`` @@ -3510,7 +3920,7 @@ Register Rule Instructions The ``DW_CFA_val_offset_sf`` instruction takes two operands: an unsigned LEB128 value representing a register number R and a signed LEB128 factored - displacement B. This instruction is identical to ``DW_CFA_val_offset`` + displacement B. This instruction is identical to ``DW_CFA_val_offset``, except that B is signed. 8. ``DW_CFA_register`` @@ -3560,7 +3970,7 @@ Register Rule Instructions The ``DW_CFA_restore_extended`` instruction takes a single unsigned LEB128 operand that represents a register number R. This instruction is identical - to ``DW_CFA_restore`` except for the encoding and size of the register + to ``DW_CFA_restore``, except for the encoding and size of the register operand. Row State Instructions @@ -3692,7 +4102,7 @@ operations. DW_OP_LLVM_undefined 0xe7 0 DW_OP_LLVM_aspace_bregx 0xe8 2 ULEB128 register number, ULEB128 byte displacement - DW_OP_LLVM_aspace_implicit_pointer 0xe9 2 4- or 8-byte offset of DIE, + DW_OP_LLVM_aspace_implicit_pointer 0xe9 2 4-byte or 8-byte offset of DIE, SLEB128 byte displacement DW_OP_LLVM_piece_end 0xea 0 DW_OP_LLVM_extend 0xeb 2 ULEB128 bit size, @@ -3827,8 +4237,16 @@ debugger information entries. Examples ======== -The AMD GPU specific usage of the features in the proposal, including examples, -is available at :ref:`amdgpu-dwarf-debug-information`. +The AMD GPU specific usage of the features in these extensions, including +examples, is available at *User Guide for AMDGPU Backend* section +:ref:`amdgpu-dwarf-debug-information`. + +.. 
note:: + + Change examples to use ``DW_OP_LLVM_offset`` instead of ``DW_OP_add`` when + acting on a location description. + + Need to provide examples of new features. .. _amdgpu-dwarf-references: diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index af764c0856000..0a29f9e8f4131 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -21,7 +21,7 @@ User Guide for AMDGPU Backend AMDGPUOperandSyntax AMDGPUInstructionSyntax AMDGPUInstructionNotation - AMDGPUDwarfProposalForHeterogeneousDebugging + AMDGPUDwarfExtensionsForHeterogeneousDebugging Introduction ============ @@ -1160,14 +1160,14 @@ DWARF Debug Information .. warning:: - This section describes a **provisional proposal** for AMDGPU DWARF [DWARF]_ - that is not currently fully implemented and is subject to change. + This section describes **provisional support** for AMDGPU DWARF [DWARF]_ that + is not currently fully implemented and is subject to change. AMDGPU generates DWARF [DWARF]_ debugging information ELF sections (see :ref:`amdgpu-elf-code-object`) which contain information that maps the code object executable code and data to the source language constructs. It can be used by tools such as debuggers and profilers. It uses features defined in -:doc:`AMDGPUDwarfProposalForHeterogeneousDebugging` that are made available in +:doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging` that are made available in DWARF Version 4 and DWARF Version 5 as an LLVM vendor extension. This section defines the AMDGPU target architecture specific DWARF mappings. @@ -1299,8 +1299,8 @@ Address Class Identifier ------------------------ The DWARF address class represents the source language memory space. See DWARF -Version 5 section 2.12 which is updated by the propoal in -:ref:`amdgpu-dwarf-segment_addresses`. +Version 5 section 2.12 which is updated by the *DWARF Extensions For +Heterogeneous Debugging* section :ref:`amdgpu-dwarf-segment_addresses`. The DWARF address class mapping used for AMDGPU is defined in :ref:`amdgpu-dwarf-address-class-mapping-table`. @@ -1321,8 +1321,8 @@ The DWARF address class mapping used for AMDGPU is defined in ``DW_ADDR_AMDGPU_region`` 0x8000 Region (GDS) ========================= ====== ================= -The DWARF address class values defined in the proposal at -:ref:`amdgpu-dwarf-segment_addresses` are used. +The DWARF address class values defined in the *DWARF Extensions For +Heterogeneous Debugging* section :ref:`amdgpu-dwarf-segment_addresses` are used. In addition, ``DW_ADDR_AMDGPU_region`` is encoded as a vendor extension. This is available for use for the AMD extension for access to the hardware GDS memory @@ -1341,8 +1341,8 @@ Address Space Identifier ------------------------ DWARF address spaces correspond to target architecture specific linear -addressable memory areas. See DWARF Version 5 section 2.12 and -:ref:`amdgpu-dwarf-segment_addresses`. +addressable memory areas. See DWARF Version 5 section 2.12 and *DWARF Extensions +For Heterogeneous Debugging* section :ref:`amdgpu-dwarf-segment_addresses`. The DWARF address space mapping used for AMDGPU is defined in :ref:`amdgpu-dwarf-address-space-mapping-table`. 
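As a rough illustration of why distinct DWARF address spaces are needed, the
following HIP-style kernel is a hypothetical sketch (it assumes a HIP
toolchain; the variable placement described in the comments is illustrative,
not mandated by the mapping table cited above):

.. code-block:: c++

  #include <hip/hip_runtime.h>

  __global__ void scale(float *out, float k) {
    // "tmp" lives in per-lane private (scratch) storage. To locate it, a
    // debugger needs a lane-relative scratch address, expressed with an
    // address space such as DW_ASPACE_AMDGPU_private_lane, rather than a
    // single flat address.
    float tmp = out[threadIdx.x] * k;
    out[threadIdx.x] = tmp;
  }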
@@ -1367,10 +1367,6 @@ The DWARF address space mapping used for AMDGPU is defined in
 *Reserved*                              0x04
 ``DW_ASPACE_AMDGPU_private_lane``       0x05  4       4        Private (Scratch) *focused lane*
 ``DW_ASPACE_AMDGPU_private_wave``       0x06  4       4        Private (Scratch) *unswizzled wavefront*
-*Reserved*                              0x07-
-                                        0x1F
-``DW_ASPACE_AMDGPU_private_lane<0-63>`` 0x20- 4       4        Private (Scratch) *specific lane*
-                                        0x5F
 ======================================= ===== ======= ======== ================= =======================

 See :ref:`amdgpu-address-spaces` for information on the AMDGPU address spaces
@@ -1442,12 +1438,6 @@ address may have to be converted as the size of a
 ``DW_ASPACE_AMDGPU_private_lane`` address may be smaller than the size of a
 ``DW_ASPACE_AMDGPU_private_wave`` address.

-The ``DW_ASPACE_AMDGPU_private_lane`` address space allows location
-expressions to specify the private address space corresponding to a specific
-lane N. For example, this can be used when the compiler spills scalar registers
-to scratch memory, with each scalar register being saved to a different lane's
-scratch memory.
-
 .. _amdgpu-dwarf-lane-identifier:

 Lane identifier
 ---------------

 DWARF lane identifiers specify a target architecture lane position for hardware
 that executes in a SIMD or SIMT manner, and on which a source language maps its
 threads of execution onto those lanes. The DWARF lane identifier is pushed by
 the ``DW_OP_LLVM_push_lane`` DWARF expression operation. See DWARF Version 5
-section 2.5 which is updated by the proposal in
-:ref:`amdgpu-dwarf-operation-expressions`.
+section 2.5 which is updated by *DWARF Extensions For Heterogeneous Debugging*
+section :ref:`amdgpu-dwarf-operation-expressions`.

 For AMDGPU, the lane identifier corresponds to the hardware lane ID of a
 wavefront. It is numbered from 0 to the wavefront size minus 1.
@@ -1493,7 +1483,8 @@ Debugger Information Entry Attributes

 This section describes how certain debugger information entry attributes are
 used by AMDGPU. See the sections in DWARF Version 5 section 2 which are updated
-by the proposal in :ref:`amdgpu-dwarf-debugging-information-entry-attributes`.
+by *DWARF Extensions For Heterogeneous Debugging* section
+:ref:`amdgpu-dwarf-debugging-information-entry-attributes`.

 .. _amdgpu-dwarf-dw-at-llvm-lane-pc:

@@ -1948,8 +1939,8 @@ DWARF Version 5 section 6.2.4):

 Source text for online-compiled programs (for example, those compiled by the
 OpenCL language runtime) may be embedded into the DWARF Version 5 line table.
-See DWARF Version 5 section 6.2.4.1 which is updated by the proposal in
-:ref:`DW_LNCT_LLVM_source
+See DWARF Version 5 section 6.2.4.1 which is updated by *DWARF Extensions For
+Heterogeneous Debugging* section :ref:`DW_LNCT_LLVM_source
 `.

 The Clang option used to control source embedding in AMDGPU is defined in
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index a9884cd9f3b5a..9c95e3ebf28a3 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -174,6 +174,16 @@ used as an abstract. Any additional information should be separated by a blank
 line. If an algorithm is based on a paper or is described in another source,
 provide a reference.

+Header Guard
+""""""""""""
+
+The header file's guard should be the all-caps path that a user of this header
+would #include, using '_' instead of the path separators and extension marker.
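A sketch of the shape such a guard takes (the guard name is derived in the
example that follows):

.. code-block:: c++

  // llvm/include/llvm/Analysis/Utils/Local.h
  #ifndef LLVM_ANALYSIS_UTILS_LOCAL_H
  #define LLVM_ANALYSIS_UTILS_LOCAL_H

  // ... declarations ...

  #endif // LLVM_ANALYSIS_UTILS_LOCAL_H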
+For example, the header file
+``llvm/include/llvm/Analysis/Utils/Local.h`` would be ``#include``-ed as
+``#include "llvm/Analysis/Utils/Local.h"``, so its guard is
+``LLVM_ANALYSIS_UTILS_LOCAL_H``.
+
 Class overviews
 """""""""""""""
diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst
index d29d293f32d8d..c9ad808e13500 100644
--- a/llvm/docs/CommandGuide/lit.rst
+++ b/llvm/docs/CommandGuide/lit.rst
@@ -81,6 +81,7 @@ OUTPUT OPTIONS
 .. option:: -s, --succinct

  Show less output, for example don't show information on tests that pass.
+ Also show a progress bar, unless ``--no-progress-bar`` is specified.

 .. option:: -v, --verbose

diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index 321cdf5a6dab1..8cc1a237e9969 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -192,10 +192,24 @@ OPTIONS

 .. option:: -mode=[latency|uops|inverse_throughput|analysis]

- Specify the run mode. Note that if you pick `analysis` mode, you also need
- to specify at least one of the `-analysis-clusters-output-file=` and
- `-analysis-inconsistencies-output-file=`.
+ Specify the run mode. Note that some modes have additional requirements and
+ options.
+
+ `latency` mode can make use of either RDTSC or LBR.
+ `latency[LBR]` is only available on X86 (at least `Skylake`).
+ To run in this mode, a positive value must be specified for
+ `x86-lbr-sample-period`, and `--repetition-mode=loop` must be used.
+
+ In `analysis` mode, you also need to specify at least one of the
+ `-analysis-clusters-output-file=` and `-analysis-inconsistencies-output-file=`.
+
+.. option:: -x86-lbr-sample-period=
+
+ Specify the LBR sampling period, that is, how many branches occur before we
+ take a sample.
+ When a positive value is specified for this option and when the mode is
+ `latency`, we will use LBRs for measuring.
+ On choosing the "right" sampling period, a small value is preferred, but
+ throttling could occur if the sampling is too frequent. A prime number should
+ be used to avoid consistently skipping certain blocks.
+
 .. option:: -repetition-mode=[duplicate|loop|min]

  Specify the repetition mode. `duplicate` will create a large, straight line
diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst
index 13a66dc48cef4..647232020e4b1 100644
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -161,6 +161,30 @@ OPTIONS
 coverage for the optimized target. This option can only be used with
 sample-based profile in extbinary format.

+.. option:: -supplement-instr-with-sample=path_to_sample_profile
+
+ Supplement an instrumentation profile with a sample profile. The sample
+ profile is the input of the flag. Output will be in instrumentation format
+ (only works with -instr).
+
+.. option:: -zero-counter-threshold=threshold_float_number
+
+ For a function which is cold in the instrumentation profile but hot in the
+ sample profile, if the ratio of the number of zero counters divided by the
+ total number of counters is above the threshold, the profile of the function
+ will be regarded as being harmful for performance and will be dropped.
+
+.. option:: -instr-prof-cold-threshold=threshold_int_number
+
+ User-specified cold threshold for the instrumentation profile, which will
+ override the cold threshold obtained from the profile summary.
+
+.. option:: -suppl-min-size-threshold=threshold_int_number
+
+ If the size of a function is smaller than the threshold, assume it can be
+ inlined by the PGO early inliner and it will not be adjusted based on the
+ sample profile.
+
 EXAMPLES
 ^^^^^^^^
 Basic Usage
diff --git a/llvm/docs/CommandLine.rst b/llvm/docs/CommandLine.rst
index ab2826d789f2f..431ebc0e67e67 100644
--- a/llvm/docs/CommandLine.rst
+++ b/llvm/docs/CommandLine.rst
@@ -1369,29 +1369,6 @@ The ``cl::ParseCommandLineOptions`` function requires two parameters (``argc``
 and ``argv``), but may also take an optional third parameter which holds
 `additional extra text`_ to emit when the ``-help`` option is invoked.

-.. _cl::ParseEnvironmentOptions:
-
-The ``cl::ParseEnvironmentOptions`` function
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``cl::ParseEnvironmentOptions`` function has mostly the same effects as
-`cl::ParseCommandLineOptions`_, except that it is designed to take values for
-options from an environment variable, for those cases in which reading the
-command line is not convenient or desired. It fills in the values of all the
-command line option variables just like `cl::ParseCommandLineOptions`_ does.
-
-It takes four parameters: the name of the program (since ``argv`` may not be
-available, it can't just look in ``argv[0]``), the name of the environment
-variable to examine, and the optional `additional extra text`_ to emit when the
-``-help`` option is invoked.
-
-``cl::ParseEnvironmentOptions`` will break the environment variable's value up
-into words and then process them using `cl::ParseCommandLineOptions`_.
-**Note:** Currently ``cl::ParseEnvironmentOptions`` does not support quoting, so
-an environment variable containing ``-option "foo bar"`` will be parsed as three
-words, ``-option``, ``"foo``, and ``bar"``, which is different from what you
-would get from the shell with the same input.
-
 The ``cl::SetVersionPrinter`` function
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 6b9c5c6899819..af93a6ed5c56e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4310,14 +4310,9 @@ PowerPC:

 - ``r``: A 32 or 64-bit integer register.
 - ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is:
   ``R1-R31``).
-- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a
-  128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers).
-- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a
-  128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit
-  altivec vector register (``V0-V31``).
-
-  .. FIXME: is this a bug that v accepts QPX registers? I think this
-     is supposed to only use the altivec vector registers?
+- ``f``: A 32 or 64-bit float register (``F0-F31``).
+- ``v``: For ``4 x f32`` or ``4 x f64`` types, a 128-bit altivec vector
+  register (``V0-V31``).
 - ``y``: Condition register (``CR0-CR7``).
 - ``wc``: An individual CR bit in a CR register.
diff --git a/llvm/docs/LibFuzzer.rst b/llvm/docs/LibFuzzer.rst
index 4e83955a05460..70a3f029c6f3e 100644
--- a/llvm/docs/LibFuzzer.rst
+++ b/llvm/docs/LibFuzzer.rst
@@ -617,6 +617,35 @@ really need to access ``argv``/``argc``.
   return 0;
 }

+Using libFuzzer as a library
+----------------------------
+If the code being fuzzed must provide its own `main`, it's possible to
+invoke libFuzzer as a library. Be sure to pass ``-fsanitize=fuzzer-no-link``
+during compilation, and link your binary against the no-main version of
+libFuzzer.
On Linux installations, this is typically located at: + +.. code-block:: bash + + /usr/lib//lib/clang//lib/linux/libclang_rt.fuzzer_no_main-.a + +If building libFuzzer from source, this is located at the following path +in the build output directory: + +.. code-block:: bash + + lib/linux/libclang_rt.fuzzer_no_main-.a + +From here, the code can do whatever setup it requires, and when it's ready +to start fuzzing, it can call `LLVMFuzzerRunDriver`, passing in the program +arguments and a callback. This callback is invoked just like +`LLVMFuzzerTestOneInput`, and has the same signature. + +.. code-block:: c++ + + extern "C" int LLVMFuzzerRunDriver(int *argc, char ***argv, + int (*UserCb)(const uint8_t *Data, size_t Size)); + + Leaks ----- diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 9a6c6944b96ef..13317ecc966e7 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -522,9 +522,9 @@ instructions that are obviously dead. A trivial dead store elimination that only considers basic-block local redundant stores. -.. _passes-functionattrs: +.. _passes-function-attrs: -``-functionattrs``: Deduce function attributes +``-function-attrs``: Deduce function attributes ---------------------------------------------- A simple interprocedural pass which walks the call-graph, looking for functions @@ -651,7 +651,7 @@ This pass can also simplify calls to specific well-known function calls (e.g. runtime library functions). For example, a call ``exit(3)`` that occurs within the ``main()`` function can be transformed into simply ``return 3``. Whether or not library calls are simplified is controlled by the -:ref:`-functionattrs ` pass and LLVM's knowledge of +:ref:`-function-attrs ` pass and LLVM's knowledge of library calls on different targets. .. _passes-aggressive-instcombine: @@ -676,15 +676,6 @@ This pass loops over all of the functions in the input module, looking for a main function. If a main function is found, all other functions and all global variables with initializers are marked as internal. -``-ipconstprop``: Interprocedural constant propagation ------------------------------------------------------- - -This pass implements an *extremely* simple interprocedural constant propagation -pass. It could certainly be improved in many different ways, like using a -worklist. This pass makes arguments dead, but does not remove them. The -existing dead argument elimination pass should be run after this to clean up -the mess. - ``-ipsccp``: Interprocedural Sparse Conditional Constant Propagation -------------------------------------------------------------------- diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index 6fd9ab2d24ca4..4ca1a359b64c9 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -273,18 +273,18 @@ Extra files If your test requires extra files besides the file containing the ``RUN:`` lines and the extra files are small, consider specifying them in the same file and -using ``extract`` to extract them. For example, +using ``split-file`` to extract them. For example, .. code-block:: llvm - ; RUN: extract b %s -o %tb.ll - ; RUN: extract a %s | llvm-link - %tb.ll -S | FileCheck %s + ; RUN: split-file %s %t + ; RUN: llvm-link -S %t/a.ll %t/b.ll | FileCheck %s ; CHECK: ... - ;--- a + ;--- a.ll ... - ;--- b + ;--- b.ll ... The parts are separated by the regex ``^(.|//)--- ``. 
By default the diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index 6e4329128fab4..2e0cffb711ef9 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -198,6 +198,6 @@ Additional Topics :doc:`AMDGPUUsage` This document describes using the AMDGPU backend to compile GPU kernels. -:doc:`AMDGPUDwarfProposalForHeterogeneousDebugging` - This document describes a DWARF proposal to support heterogeneous debugging +:doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging` + This document describes DWARF extensions to support heterogeneous debugging for targets such as the AMDGPU backend. diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst index c322797025fb6..702090447c692 100644 --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -193,7 +193,7 @@ reduction operations, such as addition, multiplication, XOR, AND and OR. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { unsigned sum = 0; for (int i = 0; i < n; ++i) sum += A[i] + 5; @@ -210,7 +210,7 @@ array. The Loop Vectorizer knows to vectorize induction variables. .. code-block:: c++ - void bar(float *A, float* B, float K, int n) { + void bar(float *A, int n) { for (int i = 0; i < n; ++i) A[i] = i; } @@ -254,7 +254,7 @@ The Loop Vectorizer can vectorize loops that count backwards. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { for (int i = n; i > 0; --i) A[i] +=1; } @@ -284,7 +284,7 @@ vectorization is profitable. .. code-block:: c++ - int foo(int *A, char *B, int n, int k) { + int foo(int *A, char *B, int n) { for (int i = 0; i < n; ++i) A[i] += 4 * B[i]; } @@ -360,7 +360,7 @@ to be used simultaneously. .. code-block:: c++ - int foo(int *A, int *B, int n) { + int foo(int *A, int n) { unsigned sum = 0; for (int i = 0; i < n; ++i) sum += A[i]; diff --git a/llvm/examples/IRTransforms/SimplifyCFG.cpp b/llvm/examples/IRTransforms/SimplifyCFG.cpp index 10658c9f09590..82368d7494212 100644 --- a/llvm/examples/IRTransforms/SimplifyCFG.cpp +++ b/llvm/examples/IRTransforms/SimplifyCFG.cpp @@ -354,18 +354,18 @@ static bool mergeIntoSinglePredecessor_v2(Function &F, DominatorTree &DT) { } static bool doSimplify_v1(Function &F) { - return eliminateCondBranches_v1(F) & mergeIntoSinglePredecessor_v1(F) & + return eliminateCondBranches_v1(F) | mergeIntoSinglePredecessor_v1(F) | removeDeadBlocks_v1(F); } static bool doSimplify_v2(Function &F, DominatorTree &DT) { - return eliminateCondBranches_v2(F, DT) & - mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT); + return eliminateCondBranches_v2(F, DT) | + mergeIntoSinglePredecessor_v2(F, DT) | removeDeadBlocks_v2(F, DT); } static bool doSimplify_v3(Function &F, DominatorTree &DT) { - return eliminateCondBranches_v3(F, DT) & - mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT); + return eliminateCondBranches_v3(F, DT) | + mergeIntoSinglePredecessor_v2(F, DT) | removeDeadBlocks_v2(F, DT); } namespace { diff --git a/llvm/include/llvm-c/Transforms/IPO.h b/llvm/include/llvm-c/Transforms/IPO.h index cde3d24609208..3f2cadf32366b 100644 --- a/llvm/include/llvm-c/Transforms/IPO.h +++ b/llvm/include/llvm-c/Transforms/IPO.h @@ -57,9 +57,6 @@ void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM); /** See llvm::createGlobalOptimizerPass function. */ void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM); -/** See llvm::createIPConstantPropagationPass function. */ -void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM); - /** See llvm::createPruneEHPass function. 
*/ void LLVMAddPruneEHPass(LLVMPassManagerRef PM); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index f7df648d27ed6..715f7cd4fdf3f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -31,6 +31,7 @@ class raw_ostream; template class SmallVectorImpl; template class ArrayRef; template class Optional; +template struct DenseMapInfo; class APInt; @@ -96,7 +97,7 @@ class LLVM_NODISCARD APInt { unsigned BitWidth; ///< The number of bits in this APInt. - friend struct DenseMapAPIntKeyInfo; + friend struct DenseMapInfo; friend class APSInt; diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index a8d0f07af94a2..2a857786f454f 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -203,9 +203,10 @@ class BitVector { return !any(); } - /// find_first_in - Returns the index of the first set bit in the range - /// [Begin, End). Returns -1 if all bits in the range are unset. - int find_first_in(unsigned Begin, unsigned End) const { + /// find_first_in - Returns the index of the first set / unset bit, + /// depending on \p Set, in the range [Begin, End). + /// Returns -1 if all bits in the range are unset / set. + int find_first_in(unsigned Begin, unsigned End, bool Set = true) const { assert(Begin <= End && End <= Size); if (Begin == End) return -1; @@ -214,8 +215,14 @@ class BitVector { unsigned LastWord = (End - 1) / BITWORD_SIZE; // Check subsequent words. + // The code below is based on search for the first _set_ bit. If + // we're searching for the first _unset_, we just take the + // complement of each word before we use it and apply + // the same method. for (unsigned i = FirstWord; i <= LastWord; ++i) { BitWord Copy = Bits[i]; + if (!Set) + Copy = ~Copy; if (i == FirstWord) { unsigned FirstBit = Begin % BITWORD_SIZE; @@ -266,32 +273,7 @@ class BitVector { /// find_first_unset_in - Returns the index of the first unset bit in the /// range [Begin, End). Returns -1 if all bits in the range are set. int find_first_unset_in(unsigned Begin, unsigned End) const { - assert(Begin <= End && End <= Size); - if (Begin == End) - return -1; - - unsigned FirstWord = Begin / BITWORD_SIZE; - unsigned LastWord = (End - 1) / BITWORD_SIZE; - - // Check subsequent words. - for (unsigned i = FirstWord; i <= LastWord; ++i) { - BitWord Copy = Bits[i]; - - if (i == FirstWord) { - unsigned FirstBit = Begin % BITWORD_SIZE; - Copy |= maskTrailingOnes(FirstBit); - } - - if (i == LastWord) { - unsigned LastBit = (End - 1) % BITWORD_SIZE; - Copy |= maskTrailingZeros(LastBit + 1); - } - if (Copy != ~BitWord(0)) { - unsigned Result = i * BITWORD_SIZE + countTrailingOnes(Copy); - return Result < size() ? Result : -1; - } - } - return -1; + return find_first_in(Begin, End, /* Set = */ false); } /// find_last_unset_in - Returns the index of the last unset bit in the diff --git a/llvm/include/llvm/ADT/DenseMapInfo.h b/llvm/include/llvm/ADT/DenseMapInfo.h index e465331ac6f7b..1cace4b3192fb 100644 --- a/llvm/include/llvm/ADT/DenseMapInfo.h +++ b/llvm/include/llvm/ADT/DenseMapInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_ADT_DENSEMAPINFO_H #define LLVM_ADT_DENSEMAPINFO_H +#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringRef.h" @@ -347,6 +348,29 @@ template <> struct DenseMapInfo { static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; } }; +/// Provide DenseMapInfo for APInt. 
+template <> struct DenseMapInfo { + static inline APInt getEmptyKey() { + APInt V(nullptr, 0); + V.U.VAL = 0; + return V; + } + + static inline APInt getTombstoneKey() { + APInt V(nullptr, 0); + V.U.VAL = 1; + return V; + } + + static unsigned getHashValue(const APInt &Key) { + return static_cast(hash_value(Key)); + } + + static bool isEqual(const APInt &LHS, const APInt &RHS) { + return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS; + } +}; + } // end namespace llvm #endif // LLVM_ADT_DENSEMAPINFO_H diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 2e7a097174763..b9b5e175f50ea 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1535,6 +1535,13 @@ OutputIt copy(R &&Range, OutputIt Out) { return std::copy(adl_begin(Range), adl_end(Range), Out); } +/// Provide wrappers to std::move which take ranges instead of having to +/// pass begin/end explicitly. +template +OutputIt move(R &&Range, OutputIt Out) { + return std::move(adl_begin(Range), adl_end(Range), Out); +} + /// Wrapper function around std::find to detect if an element exists /// in a container. template @@ -1646,6 +1653,14 @@ void erase_if(Container &C, UnaryPredicate P) { C.erase(remove_if(C, P), C.end()); } +/// Wrapper function to remove a value from a container: +/// +/// C.erase(remove(C.begin(), C.end(), V), C.end()); +template +void erase_value(Container &C, ValueType V) { + C.erase(std::remove(C.begin(), C.end(), V), C.end()); +} + /// Given a sequence container Cont, replace the range [ContIt, ContEnd) with /// the range [ValIt, ValEnd) (which is not from the same container). template diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 916fe5272c3a4..154280c217d21 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -150,8 +150,6 @@ class Triple { Apple, PC, SCEI, - BGP, - BGQ, Freescale, IBM, ImaginationTechnologies, @@ -188,7 +186,6 @@ class Triple { Minix, RTEMS, NaCl, // Native Client - CNK, // BG/P Compute-Node Kernel AIX, CUDA, // NVIDIA CUDA NVCL, // NVIDIA OpenCL diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 3e72afba36c30..7feb5b6259380 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -151,13 +151,66 @@ class BranchProbabilityInfo { /// Forget analysis results for the given basic block. void eraseBlock(const BasicBlock *BB); - // Use to track SCCs for handling irreducible loops. - using SccMap = DenseMap; - using SccHeaderMap = DenseMap; - using SccHeaderMaps = std::vector; - struct SccInfo { + class SccInfo { + // Enum of types to classify basic blocks in SCC. Basic block belonging to + // SCC is 'Inner' until it is either 'Header' or 'Exiting'. Note that a + // basic block can be 'Header' and 'Exiting' at the same time. + enum SccBlockType { + Inner = 0x0, + Header = 0x1, + Exiting = 0x2, + }; + // Map of basic blocks to SCC IDs they belong to. If basic block doesn't + // belong to any SCC it is not in the map. + using SccMap = DenseMap; + // Each basic block in SCC is attributed with one or several types from + // SccBlockType. Map value has uint32_t type (instead of SccBlockType) + // since basic block may be for example "Header" and "Exiting" at the same + // time and we need to be able to keep more than one value from + // SccBlockType. 
+    using SccBlockTypeMap = DenseMap<const BasicBlock *, uint32_t>;
+    // Vector containing classification of basic blocks for all SCCs where the
+    // i'th vector element corresponds to the SCC with ID equal to i.
+    using SccBlockTypeMaps = std::vector<SccBlockTypeMap>;
+
     SccMap SccNums;
-    SccHeaderMaps SccHeaders;
+    SccBlockTypeMaps SccBlocks;
+
+  public:
+    explicit SccInfo(const Function &F);
+
+    /// If \p BB belongs to some SCC then ID of that SCC is returned, otherwise
+    /// -1 is returned. If \p BB belongs to more than one SCC at the same time,
+    /// the result is undefined.
+    int getSCCNum(const BasicBlock *BB) const;
+    /// Returns true if \p BB is a 'header' block in SCC with \p SccNum ID,
+    /// false otherwise.
+    bool isSCCHeader(const BasicBlock *BB, int SccNum) const {
+      return getSccBlockType(BB, SccNum) & Header;
+    }
+    /// Returns true if \p BB is an 'exiting' block in SCC with \p SccNum ID,
+    /// false otherwise.
+    bool isSCCExitingBlock(const BasicBlock *BB, int SccNum) const {
+      return getSccBlockType(BB, SccNum) & Exiting;
+    }
+    /// Fills in \p Enters vector with all such blocks that don't belong to
+    /// the SCC with \p SccNum ID but there is an edge to a block belonging to
+    /// the SCC.
+    void getSccEnterBlocks(int SccNum,
+                           SmallVectorImpl<BasicBlock *> &Enters) const;
+    /// Fills in \p Exits vector with all such blocks that don't belong to
+    /// the SCC with \p SccNum ID but there is an edge from a block belonging
+    /// to the SCC.
+    void getSccExitBlocks(int SccNum,
+                          SmallVectorImpl<BasicBlock *> &Exits) const;
+
+  private:
+    /// Returns \p BB's type according to the classification given by the
+    /// SccBlockType enum. Please note that \p BB must belong to the SCC with
+    /// \p SccNum ID.
+    uint32_t getSccBlockType(const BasicBlock *BB, int SccNum) const;
+    /// Calculates \p BB's type and stores it in internal data structures for
+    /// future use. Please note that \p BB must belong to the SCC with
+    /// \p SccNum ID.
+    void calculateSccBlockType(const BasicBlock *BB, int SccNum);
   };

 private:
@@ -196,6 +249,9 @@ class BranchProbabilityInfo {
   /// Track the last function we run over for printing.
   const Function *LastF = nullptr;

+  /// Keeps information about all SCCs in a function.
+  std::unique_ptr<const SccInfo> SccI;
+
   /// Track the set of blocks directly succeeded by a returning block.
SmallPtrSet PostDominatedByUnreachable; @@ -210,8 +266,7 @@ class BranchProbabilityInfo { bool calcMetadataWeights(const BasicBlock *BB); bool calcColdCallHeuristics(const BasicBlock *BB); bool calcPointerHeuristics(const BasicBlock *BB); - bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI, - SccInfo &SccI); + bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI); bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI); bool calcFloatingPointHeuristics(const BasicBlock *BB); bool calcInvokeHeuristics(const BasicBlock *BB); diff --git a/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h b/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h index fb66052851563..8166b52aa226c 100644 --- a/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/llvm/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -73,13 +73,7 @@ ChildrenGetterTy::get(const NodeRef &N) { return {Children.begin(), Children.end()}; } - using SnapShotBBPairTy = - std::pair *, OrderedNodeTy>; - - ChildrenTy Ret; - for (const auto &SnapShotBBPair : children({GD, N})) - Ret.emplace_back(SnapShotBBPair.second); - return Ret; + return GD->template getChildren(N); } } // end of namespace IDFCalculatorDetail diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index 5ce2b3fd047f7..3ec09e8c0a45e 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -1214,6 +1214,8 @@ class upward_defs_iterator BasicBlock *getPhiArgBlock() const { return DefIterator.getPhiArgBlock(); } + bool performedPhiTranslation() const { return PerformedPhiTranslation; } + private: void fillInCurrentPair() { CurrentPair.first = *DefIterator; @@ -1226,6 +1228,7 @@ class upward_defs_iterator false)) { if (Translator.getAddr() != Location.Ptr) { CurrentPair.second = Location.getWithNewPtr(Translator.getAddr()); + PerformedPhiTranslation = true; return; } } else { @@ -1240,8 +1243,9 @@ class upward_defs_iterator memoryaccess_def_iterator DefIterator; MemoryLocation Location; MemoryAccess *OriginalAccess = nullptr; - bool WalkingPhi = false; DominatorTree *DT = nullptr; + bool WalkingPhi = false; + bool PerformedPhiTranslation = false; }; inline upward_defs_iterator upward_defs_begin(const MemoryAccessPair &Pair, diff --git a/llvm/include/llvm/Analysis/MemorySSAUpdater.h b/llvm/include/llvm/Analysis/MemorySSAUpdater.h index 20588ef083c59..d41b932099794 100644 --- a/llvm/include/llvm/Analysis/MemorySSAUpdater.h +++ b/llvm/include/llvm/Analysis/MemorySSAUpdater.h @@ -52,8 +52,6 @@ class LoopBlocksRPO; using ValueToValueMapTy = ValueMap; using PhiToDefMap = SmallDenseMap; using CFGUpdate = cfg::Update; -using GraphDiffInvBBPair = - std::pair *, Inverse>; class MemorySSAUpdater { private: diff --git a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h index 945d41c376779..156d5a560b9ec 100644 --- a/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h +++ b/llvm/include/llvm/Analysis/ObjCARCAnalysisUtils.h @@ -64,10 +64,9 @@ inline bool ModuleHasARC(const Module &M) { /// This is a wrapper around getUnderlyingObject which also knows how to /// look through objc_retain and objc_autorelease calls, which we know to return /// their argument verbatim. 
-inline const Value *GetUnderlyingObjCPtr(const Value *V,
-                                          const DataLayout &DL) {
+inline const Value *GetUnderlyingObjCPtr(const Value *V) {
   for (;;) {
-    V = GetUnderlyingObject(V, DL);
+    V = getUnderlyingObject(V);
     if (!IsForwarding(GetBasicARCInstKind(V)))
       break;
     V = cast<CallInst>(V)->getArgOperand(0);
   }
@@ -78,12 +77,12 @@ inline const Value *GetUnderlyingObjCPtr(const Value *V,

 /// A wrapper for GetUnderlyingObjCPtr used for results memoization.
 inline const Value *
-GetUnderlyingObjCPtrCached(const Value *V, const DataLayout &DL,
+GetUnderlyingObjCPtrCached(const Value *V,
                            DenseMap<const Value *, WeakTrackingVH> &Cache) {
   if (auto InCache = Cache.lookup(V))
     return InCache;

-  const Value *Computed = GetUnderlyingObjCPtr(V, DL);
+  const Value *Computed = GetUnderlyingObjCPtr(V);
   Cache[V] = const_cast<Value *>(Computed);
   return Computed;
 }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index e1426e3e5192a..092c9744a21bc 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1021,10 +1021,47 @@ class TargetTransformInfo {
   int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0,
                      VectorType *SubTp = nullptr) const;

+  /// Represents a hint about the context in which a cast is used.
+  ///
+  /// For zext/sext, the context of the cast is the operand, which must be a
+  /// load of some kind. For trunc, the context of the cast is the single
+  /// user of the instruction, which must be a store of some kind.
+  ///
+  /// This enum allows the vectorizer to give getCastInstrCost an idea of the
+  /// type of cast it's dealing with, as not every cast is equal. For instance,
+  /// the zext of a load may be free, but the zext of an interleaving load can
+  /// be (very) expensive!
+  ///
+  /// See \c getCastContextHint to compute a CastContextHint from a cast
+  /// Instruction*. Callers can use it if they don't need to override the
+  /// context and just want it to be calculated from the instruction.
+  ///
+  /// FIXME: This handles the types of load/store that the vectorizer can
+  /// produce, which are the cases where the context instruction is most
+  /// likely to be incorrect. There are other situations where that can happen
+  /// too, which might be handled here but in the long run a more general
+  /// solution of costing multiple instructions at the same time may be better.
+  enum class CastContextHint : uint8_t {
+    None,          ///< The cast is not used with a load/store of any kind.
+    Normal,        ///< The cast is used with a normal load/store.
+    Masked,        ///< The cast is used with a masked load/store.
+    GatherScatter, ///< The cast is used with a gather/scatter.
+    Interleave,    ///< The cast is used with an interleaved load/store.
+    Reversed,      ///< The cast is used with a reversed load/store.
+  };
+
+  /// Calculates a CastContextHint from \p I.
+  /// This should be used by callers of getCastInstrCost if they wish to
+  /// determine the context from some instruction.
+  /// \returns the CastContextHint for ZExt/SExt/Trunc, or None if \p I is
+  /// nullptr or another type of cast.
+  static CastContextHint getCastContextHint(const Instruction *I);
+
   /// \return The expected cost of cast instructions, such as bitcast, trunc,
   /// zext, etc. If there is an existing instruction that holds Opcode, it
   /// may be passed in the 'I' parameter.
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, const Instruction *I = nullptr) const; @@ -1454,6 +1491,7 @@ class TargetTransformInfo::Concept { virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) = 0; virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -1882,9 +1920,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getShuffleCost(Kind, Tp, Index, SubTp); } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) override { - return Impl.getCastInstrCost(Opcode, Dst, Src, CostKind, I); + return Impl.getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); } int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 73e5ff60a4be6..4dc0a90cc9db1 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -423,6 +423,7 @@ class TargetTransformInfoImplBase { } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { switch (Opcode) { @@ -915,7 +916,8 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { case Instruction::SExt: case Instruction::ZExt: case Instruction::AddrSpaceCast: - return TargetTTI->getCastInstrCost(Opcode, Ty, OpTy, CostKind, I); + return TargetTTI->getCastInstrCost( + Opcode, Ty, OpTy, TTI::getCastContextHint(I), CostKind, I); case Instruction::Store: { auto *SI = cast(U); Type *ValTy = U->getOperand(0)->getType(); diff --git a/llvm/include/llvm/Analysis/Utils/TFUtils.h b/llvm/include/llvm/Analysis/Utils/TFUtils.h index b93ca88ec1c93..d4450276a22ee 100644 --- a/llvm/include/llvm/Analysis/Utils/TFUtils.h +++ b/llvm/include/llvm/Analysis/Utils/TFUtils.h @@ -13,6 +13,7 @@ #ifdef LLVM_HAVE_TF_API #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/JSON.h" #include #include @@ -36,6 +37,53 @@ namespace llvm { class TFModelEvaluatorImpl; class EvaluationResultImpl; +/// TensorSpec encapsulates the specification of a tensor: its dimensions, or +/// "shape" (row-major), its type (see TensorSpec::getDataType specializations +/// for supported types), its name and port (see "TensorFlow: Large-Scale +/// Machine Learning on Heterogeneous Distributed Systems", section 4.2, para 2: +/// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) +/// +/// TensorSpec is used to set up a TFModelEvaluator by describing the expected +/// inputs and outputs. 
+class TensorSpec final { +public: + template + static TensorSpec createSpec(const std::string &Name, + const std::vector &Shape, + int Port = 0) { + return TensorSpec(Name, Port, getDataType(), Shape); + } + + const std::string &name() const { return Name; } + int port() const { return Port; } + int typeIndex() const { return TypeIndex; } + const std::vector &shape() const { return Shape; } + + bool operator==(const TensorSpec &Other) const { + return Name == Other.Name && Port == Other.Port && + TypeIndex == Other.TypeIndex && Shape == Other.Shape; + } + + bool operator!=(const TensorSpec &Other) const { return !(*this == Other); } + +private: + TensorSpec(const std::string &Name, int Port, int TypeIndex, + const std::vector &Shape) + : Name(Name), Port(Port), TypeIndex(TypeIndex), Shape(Shape) {} + + template static int getDataType() { + llvm_unreachable("Undefined tensor type"); + } + + std::string Name; + int Port = 0; + int TypeIndex = 0; + std::vector Shape; +}; + +Optional getTensorSpecFromJSON(LLVMContext &Ctx, + const json::Value &Value); + class TFModelEvaluator final { public: /// The result of a model evaluation. Handles the lifetime of the output @@ -60,8 +108,8 @@ class TFModelEvaluator final { }; TFModelEvaluator(StringRef SavedModelPath, - const std::vector &InputNames, - const std::vector &OutputNames, + const std::vector &InputSpecs, + const std::vector &OutputSpecs, const char *Tags = "serve"); ~TFModelEvaluator(); TFModelEvaluator(const TFModelEvaluator &) = delete; @@ -82,33 +130,33 @@ class TFModelEvaluator final { /// otherwise. bool isValid() const { return !!Impl; } - /// Initialize the input at Index as a tensor of the given type and - /// dimensions. - template - void initInput(size_t Index, const std::vector &Dimensions) { - return initInput(Index, getModelTypeIndex(), Dimensions); - } - private: void *getUntypedInput(size_t Index); - template int getModelTypeIndex(); - void initInput(size_t Index, int TypeIndex, - const std::vector &Dimensions); - std::unique_ptr Impl; }; -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); -template <> int TFModelEvaluator::getModelTypeIndex(); - +/// List of supported types, as a triple: +/// C++ type +/// short name (for strings, for instance) +/// capitalized short name (for enums, for instance) +#define TFUTILS_SUPPORTED_TYPES(M) \ + M(float, float, FLOAT) \ + M(double, double, DOUBLE) \ + M(int8_t, int8, INT8) \ + M(uint8_t, uint8, UINT8) \ + M(int16_t, int16, INT16) \ + M(uint16_t, uint16, UINT16) \ + M(int32_t, int32, INT32) \ + M(uint32_t, uint32, UINT32) \ + M(int64_t, int64, INT64) \ + M(uint64_t, uint64, UINT64) + +#define TFUTILS_GETDATATYPE_DEF(T, S, C) \ + template <> int TensorSpec::getDataType(); + +TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_DEF) + +#undef TFUTILS_GETDATATYPE_DEF } // namespace llvm #endif // LLVM_HAVE_TF_API diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index ce5aea2e8d34d..0f95ad4b6c0ab 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ 
-28,6 +28,7 @@ namespace llvm { class AddOperator; +class AllocaInst; class APInt; class AssumptionCache; class DominatorTree; @@ -367,14 +368,13 @@ class Value; /// that the returned value has pointer type if the specified value does. If /// the MaxLookup value is non-zero, it limits the number of instructions to /// be stripped off. - Value *GetUnderlyingObject(Value *V, const DataLayout &DL, - unsigned MaxLookup = 6); - inline const Value *GetUnderlyingObject(const Value *V, const DataLayout &DL, + Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6); + inline const Value *getUnderlyingObject(const Value *V, unsigned MaxLookup = 6) { - return GetUnderlyingObject(const_cast(V), DL, MaxLookup); + return getUnderlyingObject(const_cast(V), MaxLookup); } - /// This method is similar to GetUnderlyingObject except that it can + /// This method is similar to getUnderlyingObject except that it can /// look through phi and select instructions and return multiple objects. /// /// If LoopInfo is passed, loop phis are further analyzed. If a pointer @@ -402,20 +402,28 @@ class Value; /// Since A[i] and A[i-1] are independent pointers, getUnderlyingObjects /// should not assume that Curr and Prev share the same underlying object thus /// it shouldn't look through the phi above. - void GetUnderlyingObjects(const Value *V, + void getUnderlyingObjects(const Value *V, SmallVectorImpl &Objects, - const DataLayout &DL, LoopInfo *LI = nullptr, - unsigned MaxLookup = 6); + LoopInfo *LI = nullptr, unsigned MaxLookup = 6); - /// This is a wrapper around GetUnderlyingObjects and adds support for basic + /// This is a wrapper around getUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. bool getUnderlyingObjectsForCodeGen(const Value *V, - SmallVectorImpl &Objects, - const DataLayout &DL); + SmallVectorImpl &Objects); + + /// Finds alloca where the value comes from. + AllocaInst *findAllocaForValue(Value *V); + inline const AllocaInst *findAllocaForValue(const Value *V) { + return findAllocaForValue(const_cast(V)); + } /// Return true if the only users of this pointer are lifetime markers. bool onlyUsedByLifetimeMarkers(const Value *V); + /// Return true if the only users of this pointer are lifetime markers or + /// droppable instructions. + bool onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V); + /// Return true if speculation of the given load must be suppressed to avoid /// ordering or interfering with an active sanitizer. If not suppressed, /// dereferenceability and alignment must be proven separately. 
Note: This diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index b1d7850442fba..a860652b2332a 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -14,12 +14,12 @@ #define LLVM_ANALYSIS_VECTORUTILS_H #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Support/CheckedArithmetic.h" namespace llvm { +class TargetLibraryInfo; /// Describes the type of Parameters enum class VFParamKind { diff --git a/llvm/include/llvm/Bitcode/BitcodeCommon.h b/llvm/include/llvm/Bitcode/BitcodeCommon.h new file mode 100644 index 0000000000000..6a3e74550bc48 --- /dev/null +++ b/llvm/include/llvm/Bitcode/BitcodeCommon.h @@ -0,0 +1,30 @@ +//===- BitcodeCommon.h - Common code for encode/decode --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header defines common code to be used by BitcodeWriter and +// BitcodeReader. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BITCODE_BITCODECOMMON_H +#define LLVM_BITCODE_BITCODECOMMON_H + +#include "llvm/ADT/Bitfields.h" + +namespace llvm { + +struct AllocaPackedValues { + using Align = Bitfield::Element; + using UsedWithInAlloca = Bitfield::Element; + using ExplicitType = Bitfield::Element; + using SwiftError = Bitfield::Element; +}; + +} // namespace llvm + +#endif // LLVM_BITCODE_BITCODECOMMON_H diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 9ee1d13a34d67..9e5c45084c599 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -222,7 +222,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { - return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS); + return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); } Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, @@ -716,9 +716,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { - if (BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I) == 0) + if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) return 0; const TargetLoweringBase *TLI = getTLI(); @@ -756,15 +757,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return 0; LLVM_FALLTHROUGH; case Instruction::SExt: - if (!I) - break; - - if (getTLI()->isExtFree(I)) + if (I && getTLI()->isExtFree(I)) return 0; // If this is a zext/sext of a load, return 0 if the corresponding // extending load exists on target. - if (I && isa(I->getOperand(0))) { + if (CCH == TTI::CastContextHint::Normal) { EVT ExtVT = EVT::getEVT(Dst); EVT LoadVT = EVT::getEVT(Src); unsigned LType = @@ -839,7 +837,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned SplitCost = (!SplitSrc || !SplitDst) ? 
TTI->getVectorSplitCost() : 0; return SplitCost + - (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, + (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, CostKind, I)); } @@ -847,7 +845,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // the operation will get scalarized. unsigned Num = cast(DstVTy)->getNumElements(); unsigned Cost = thisT()->getCastInstrCost( - Opcode, Dst->getScalarType(), Src->getScalarType(), CostKind, I); + Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. @@ -872,7 +870,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, Index) + thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), - TTI::TCK_RecipThroughput); + TTI::CastContextHint::None, TTI::TCK_RecipThroughput); } unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { @@ -1457,6 +1455,26 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, /*IsUnsigned=*/true, CostKind); + case Intrinsic::abs: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: { + // abs(X) = select(icmp(X,0),X,sub(0,X)) + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + unsigned Cost = 0; + // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code. + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CostKind); + // TODO: Should we add an OperandValueProperties::OP_Zero property? + if (IID == Intrinsic::abs) + Cost += thisT()->getArithmeticInstrCost( + BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { Type *CondTy = RetTy->getWithNewBitWidth(1); @@ -1502,13 +1520,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ExtOp = IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; unsigned Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CostKind); + CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); @@ -1567,13 +1586,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ExtOp = IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; unsigned Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CostKind); + CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index c99ca00eac29f..b6bde0249f883 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -91,13 +91,33 @@ class FunctionLoweringInfo { /// Track virtual registers created for exception pointers. DenseMap CatchPadExceptionPointers; - /// Keep track of frame indices allocated for statepoints as they could be - /// used across basic block boundaries (e.g. for an invoke). For each - /// gc.statepoint instruction, maps uniqued llvm IR values to the slots they - /// were spilled in. If a value is mapped to None it means we visited the - /// value but didn't spill it (because it was a constant, for instance). - using StatepointSpillMapTy = DenseMap>; - DenseMap StatepointSpillMaps; + /// Helper object to track which of three possible relocation mechanisms are + /// used for a particular value being relocated over a statepoint. + struct StatepointRelocationRecord { + enum RelocType { + // Value did not need to be relocated and can be used directly. + NoRelocate, + // Value was spilled to stack and needs filled at the gc.relocate. + Spill, + // Value was lowered to tied def and gc.relocate should be replaced with + // copy from vreg. + VReg, + } type = NoRelocate; + // Payload contains either frame index of the stack slot in which the value + // was spilled, or virtual register which contains the re-definition. + union payload_t { + payload_t() : FI(-1) {} + int FI; + Register Reg; + } payload; + }; + + /// Keep track of each value which was relocated and the strategy used to + /// relocate that value. This information is required when visiting + /// gc.relocates which may appear in following blocks. + using StatepointSpillMapTy = + DenseMap; + DenseMap StatepointRelocationMaps; /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in /// the entry block. This allows the allocas to be efficiently referenced diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index e694e7ad2c834..3847112d51836 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -200,6 +200,14 @@ class CombinerHelper { bool applyCombineShiftToUnmerge(MachineInstr &MI, const unsigned &ShiftVal); bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount); + /// Transform IntToPtr(PtrToInt(x)) to x if cast is in the same address space. + bool matchCombineI2PToP2I(MachineInstr &MI, Register &Reg); + bool applyCombineI2PToP2I(MachineInstr &MI, Register &Reg); + + /// Transform PtrToInt(IntToPtr(x)) to x. + bool matchCombineP2IToI2P(MachineInstr &MI, Register &Reg); + bool applyCombineP2IToI2P(MachineInstr &MI, Register &Reg); + /// Return true if any explicit use operand on \p MI is defined by a /// G_IMPLICIT_DEF. 
bool matchAnyExplicitUseIsUndef(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 751ab67c4e973..928743a6cbd7d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -223,6 +223,8 @@ class IRTranslator : public MachineFunctionPass { bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder); + bool translateFixedPointIntrinsic(unsigned Op, const CallInst &CI, + MachineIRBuilder &MIRBuilder); /// Helper function for translateSimpleIntrinsic. /// \return The generic opcode for \p IntrinsicID if \p IntrinsicID is a diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 0a490249f1568..dfd27bd5f7c5f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -154,6 +154,10 @@ class LegalizerHelper { /// def by inserting a G_BITCAST from \p CastTy void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + /// Widen \p OrigReg to \p WideTy by merging to a wider type, padding with + /// G_IMPLICIT_DEF, and producing dead results. + Register widenWithUnmerge(LLT WideTy, Register OrigReg); + private: LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); @@ -228,7 +232,23 @@ class LegalizerHelper { ArrayRef Src1Regs, ArrayRef Src2Regs, LLT NarrowTy); + void changeOpcode(MachineInstr &MI, unsigned NewOpcode); + public: + /// Return the alignment to use for a stack temporary object with the given + /// type. + Align getStackTemporaryAlignment(LLT Type, Align MinAlign = Align()) const; + + /// Create a stack temporary based on the size in bytes and the alignment + MachineInstrBuilder createStackTemporary(TypeSize Bytes, Align Alignment, + MachinePointerInfo &PtrInfo); + + /// Get a pointer to vector element \p Index located in memory for a vector of + /// type \p VecTy starting at a base address of \p VecPtr. If \p Index is out + /// of bounds the returned pointer is unspecified, but will be within the + /// vector bounds. + Register getVectorElementPointer(Register VecPtr, LLT VecTy, Register Index); + LegalizeResult fewerElementsVectorImplicitDef(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy); @@ -291,6 +311,10 @@ class LegalizerHelper { LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + /// Perform Bitcast legalize action on G_EXTRACT_VECTOR_ELT. 
+ LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy); + LegalizeResult lowerBitcast(MachineInstr &MI); LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty); @@ -312,6 +336,7 @@ class LegalizerHelper { LegalizeResult lowerFFloor(MachineInstr &MI); LegalizeResult lowerMergeValues(MachineInstr &MI); LegalizeResult lowerUnmergeValues(MachineInstr &MI); + LegalizeResult lowerExtractVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); LegalizeResult lowerExtract(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index b50d7cd7a5f91..57cc693e1817b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -218,11 +218,19 @@ Predicate any(Predicate P0, Predicate P1, Args... args) { return any(any(P0, P1), args...); } -/// True iff the given type index is the specified types. +/// True iff the given type index is the specified type. LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit); /// True iff the given type index is one of the specified types. LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list TypesInit); + +/// True iff the given type index is not the specified type. +inline LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx] != Type; + }; +} + /// True iff the given types for the given pair of type indexes is one of the /// specified type pairs. LegalityPredicate @@ -659,6 +667,15 @@ class LegalizeRuleSet { Types2); } + /// The instruction is emitted as a library call. + LegalizeRuleSet &libcall() { + using namespace LegalizeMutations; + // We have no choice but conservatively assume that predicate-less lowering + // properly handles all type indices by design: + markAllIdxsAsCovered(); + return actionIf(LegalizeAction::Libcall, always); + } + /// Like legalIf, but for the Libcall action. LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a libcall with a diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 3ad2effea1b61..cd0dcefb49c7e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -813,7 +813,17 @@ class MachineIRBuilder { /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildLoad(const DstOp &Res, const SrcOp &Addr, - MachineMemOperand &MMO); + MachineMemOperand &MMO) { + return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO); + } + + /// Build and insert a G_LOAD instruction, while constructing the + /// MachineMemOperand. + MachineInstrBuilder + buildLoad(const DstOp &Res, const SrcOp &Addr, MachinePointerInfo PtrInfo, + Align Alignment, + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes()); /// Build and insert `Res = Addr, MMO`. /// @@ -847,6 +857,14 @@ class MachineIRBuilder { MachineInstrBuilder buildStore(const SrcOp &Val, const SrcOp &Addr, MachineMemOperand &MMO); + /// Build and insert a G_STORE instruction, while constructing the + /// MachineMemOperand. 
+ MachineInstrBuilder + buildStore(const SrcOp &Val, const SrcOp &Addr, MachinePointerInfo PtrInfo, + Align Alignment, + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, + const AAMDNodes &AAInfo = AAMDNodes()); + /// Build and insert `Res0, ... = G_EXTRACT Src, Idx0`. /// /// \pre setBasicBlock or setMI must have been called. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 8eb95a8899913..35add316b5b67 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -213,5 +213,9 @@ LLT getLCMType(LLT OrigTy, LLT TargetTy); LLVM_READNONE LLT getGCDType(LLT OrigTy, LLT TargetTy); +/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. +/// If \p MI is not a splat, returns None. +Optional getSplatIndex(MachineInstr &MI); + } // End namespace llvm. #endif diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index d6cb7211cf70e..0360e706cbc4a 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -872,6 +872,14 @@ class MachineBasicBlock void print(raw_ostream &OS, ModuleSlotTracker &MST, const SlotIndexes * = nullptr, bool IsStandalone = true) const; + enum PrintNameFlag { + PrintNameIr = (1 << 0), ///< Add IR name where available + PrintNameAttributes = (1 << 1), ///< Print attributes + }; + + void printName(raw_ostream &os, unsigned printNameFlags = PrintNameIr, + ModuleSlotTracker *moduleSlotTracker = nullptr) const; + // Printing method used by LoopInfo. void printAsOperand(raw_ostream &OS, bool PrintType = true) const; diff --git a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index 7ce11c784b08f..6c442d3d07bdb 100644 --- a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -74,7 +74,11 @@ class MachineBlockFrequencyInfo : public MachineFunctionPass { bool isIrrLoopHeader(const MachineBasicBlock *MBB) const; - void setBlockFreq(const MachineBasicBlock *MBB, uint64_t Freq); + /// incrementally calculate block frequencies when we split edges, to avoid + /// full CFG traversal. + void onEdgeSplit(const MachineBasicBlock &NewPredecessor, + const MachineBasicBlock &NewSuccessor, + const MachineBranchProbabilityInfo &MBPI); const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h index cabb9f1c97c96..b31e9cdb0e903 100644 --- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h @@ -40,20 +40,30 @@ class MDNode; namespace RegState { - enum { - Define = 0x2, - Implicit = 0x4, - Kill = 0x8, - Dead = 0x10, - Undef = 0x20, - EarlyClobber = 0x40, - Debug = 0x80, - InternalRead = 0x100, - Renamable = 0x200, - DefineNoRead = Define | Undef, - ImplicitDefine = Implicit | Define, - ImplicitKill = Implicit | Kill - }; +enum { + /// Register definition. + Define = 0x2, + /// Not emitted register (e.g. carry, or temporary result). + Implicit = 0x4, + /// The last use of a register. + Kill = 0x8, + /// Unused definition. + Dead = 0x10, + /// Value of the register doesn't matter. + Undef = 0x20, + /// Register definition happens before uses. + EarlyClobber = 0x40, + /// Register 'use' is for debugging purpose. 
+ Debug = 0x80, + /// Register reads a value that is defined inside the same instruction or + /// bundle. + InternalRead = 0x100, + /// Register that may be renamed. + Renamable = 0x200, + DefineNoRead = Define | Undef, + ImplicitDefine = Implicit | Define, + ImplicitKill = Implicit | Kill +}; } // end namespace RegState diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index c3a11b1996759..d6580430daf73 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -134,6 +134,12 @@ class TargetFrameLowering { /// was called). virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; + /// This method returns whether or not it is safe for an object with the + /// given stack id to be bundled into the local area. + virtual bool isStackIdSafeForLocalArea(unsigned StackId) const { + return true; + } + /// getOffsetOfLocalArea - This method returns the offset of the local area /// from the stack pointer on entrance to a function. /// diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index a92761abd2f82..f93b11bd1c2cb 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1759,17 +1759,10 @@ class TargetLoweringBase { return ""; } - /// Returns true if a cast between SrcAS and DestAS is a noop. - virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return false; - } - /// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we /// are happy to sink it into basic blocks. A cast may be free, but not /// necessarily a no-op. e.g. a free truncate from a 64-bit to 32-bit pointer. - virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isNoopAddrSpaceCast(SrcAS, DestAS); - } + virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const; /// Return true if the pointer arguments to CI should be aligned by aligning /// the object whose address is being passed. If so then MinSize is set to the @@ -4189,7 +4182,7 @@ class TargetLowering : public TargetLoweringBase { // Lower custom output constraints. If invalid, return SDValue(). virtual SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, - SDLoc DL, + const SDLoc &DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const; @@ -4270,7 +4263,7 @@ class TargetLowering : public TargetLoweringBase { /// \param RL Low bits of the RHS of the MUL. See LL for meaning /// \param RH High bits of the RHS of the MUL. See LL for meaning. /// \returns true if the node has been expanded, false if it has not - bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS, + bool expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, SDValue LHS, SDValue RHS, SmallVectorImpl &Result, EVT HiLoVT, SelectionDAG &DAG, MulExpansionKind Kind, SDValue LL = SDValue(), SDValue LH = SDValue(), diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 298fab318c10d..b8c7e070eb341 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -1,6 +1,9 @@ #ifndef CONFIG_H #define CONFIG_H +// Include this header only under the llvm source tree. +// This is a private header. + /* Exported configuration */ #include "llvm/Config/llvm-config.h" @@ -97,6 +100,9 @@ /* Define to 1 if you have the `pfm' library (-lpfm). 
*/ #cmakedefine HAVE_LIBPFM ${HAVE_LIBPFM} +/* Define to 1 if the `perf_branch_entry' struct has field cycles. */ +#cmakedefine LIBPFM_HAS_FIELD_CYCLES ${LIBPFM_HAS_FIELD_CYCLES} + /* Define to 1 if you have the `psapi' library (-lpsapi). */ #cmakedefine HAVE_LIBPSAPI ${HAVE_LIBPSAPI} @@ -332,9 +338,6 @@ /* Whether GlobalISel rule coverage is being collected */ #cmakedefine01 LLVM_GISEL_COV_ENABLED -/* Define if we have z3 and want to build it */ -#cmakedefine LLVM_WITH_Z3 ${LLVM_WITH_Z3} - /* Define to the default GlobalISel coverage file prefix */ #cmakedefine LLVM_GISEL_COV_PREFIX "${LLVM_GISEL_COV_PREFIX}" diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 01892bc5b610b..ee299876825ea 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -79,6 +79,9 @@ */ #cmakedefine01 LLVM_FORCE_ENABLE_STATS +/* Define if we have z3 and want to build it */ +#cmakedefine LLVM_WITH_Z3 ${LLVM_WITH_Z3} + /* Define if LLVM was built with a dependency to the libtensorflow dynamic library */ #cmakedefine LLVM_HAVE_TF_API diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 76f9dea4160f7..5e22837fa9309 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -395,6 +395,10 @@ class Symbol { return Name; } + /// Rename this symbol. The client is responsible for updating scope and + /// linkage if this name-change requires it. + void setName(StringRef Name) { this->Name = Name; } + /// Returns true if this Symbol has content (potentially) defined within this /// object file (i.e. is anything but an external or absolute symbol). bool isDefined() const { @@ -959,7 +963,7 @@ class LinkGraph { Section &Sec = Sym.getBlock().getSection(); Sec.removeSymbol(Sym); } - Sym.makeExternal(createAddressable(false)); + Sym.makeExternal(createAddressable(0, false)); ExternalSymbols.insert(&Sym); } @@ -1279,7 +1283,11 @@ class JITLinkContext { /// their final memory locations in the target process. At this point the /// LinkGraph can be inspected to build a symbol table, however the block /// content will not generally have been copied to the target location yet. - virtual void notifyResolved(LinkGraph &G) = 0; + /// + /// If the client detects an error in the LinkGraph state (e.g. unexpected or + /// missing symbols) they may return an error here. The error will be + /// propagated to notifyFailed and the linker will bail out. + virtual Error notifyResolved(LinkGraph &G) = 0; /// Called by JITLink to notify the context that the object has been /// finalized (i.e. emitted to memory and memory permissions set). If all of diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h b/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h index 6c95e22a4257d..d35c8abc84a2e 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.h @@ -27,10 +27,9 @@ class TPCDynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { /// If the Allow predicate is given then only symbols matching the predicate /// will be searched for. If the predicate is not given then all symbols will /// be searched for. 
- TPCDynamicLibrarySearchGenerator( - TargetProcessControl &TPC, - TargetProcessControl::DynamicLibraryHandle DylibHandle) - : TPC(TPC), DylibHandle(DylibHandle) {} + TPCDynamicLibrarySearchGenerator(TargetProcessControl &TPC, + TargetProcessControl::DylibHandle H) + : TPC(TPC), H(H) {} /// Permanently loads the library at the given path and, on success, returns /// a DynamicLibrarySearchGenerator that will search it for symbol definitions @@ -51,7 +50,7 @@ class TPCDynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { private: TargetProcessControl &TPC; - TargetProcessControl::DynamicLibraryHandle DylibHandle; + TargetProcessControl::DylibHandle H; }; } // end namespace orc diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h index 37bfa5a45b6c0..e260c64bee512 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcessControl.h @@ -110,15 +110,18 @@ class TargetProcessControl { } }; - using DynamicLibraryHandle = JITTargetAddress; - - /// Request lookup within a single library. - /// If Library is None then the whole target process should be searched. + /// A handle for a library opened via loadDylib. + /// + /// Note that this handle does not necessarily represent a JITDylib: it may + /// be a regular dynamic library or shared object (e.g. one opened via a + /// dlopen in the target process). + using DylibHandle = JITTargetAddress; + + /// Request lookup within the given DylibHandle. struct LookupRequestElement { - LookupRequestElement(DynamicLibraryHandle Handle, - const SymbolLookupSet &Symbols) + LookupRequestElement(DylibHandle Handle, const SymbolLookupSet &Symbols) : Handle(Handle), Symbols(Symbols) {} - DynamicLibraryHandle Handle; + DylibHandle Handle; const SymbolLookupSet &Symbols; }; @@ -140,11 +143,10 @@ class TargetProcessControl { /// Return a MemoryAccess object for the target process. MemoryAccess &getMemoryAccess() const { return *MemAccess; } - /// Load the library at the given path. Returns a handle to the loaded - /// library. If LibraryPath is null this function will return the global - /// handle for the target process. - virtual Expected - loadLibrary(const char *LibraryPath) = 0; + /// Load the dynamic library at the given path and return a handle to it. + /// If LibraryPath is null this function will return the global handle for + /// the target process. + virtual Expected loadDylib(const char *DylibPath) = 0; /// Search for symbols in the target process. 
/// The result of the lookup is a 2-dimensional array of target addresses @@ -152,7 +154,6 @@ virtual Expected lookupSymbols(LookupRequest Request) = 0; protected: - TargetProcessControl(Triple TT, unsigned PageSize); Triple TT; unsigned PageSize = 0; @@ -168,7 +169,7 @@ class SelfTargetProcessControl : public TargetProcessControl, static Expected> Create(); - Expected loadLibrary(const char *LibraryPath) override; + Expected loadDylib(const char *DylibPath) override; Expected lookupSymbols(LookupRequest Request) override; diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td index e96b7e8466628..4dbf4bbc0fb0b 100644 --- a/llvm/include/llvm/Frontend/OpenACC/ACC.td +++ b/llvm/include/llvm/Frontend/OpenACC/ACC.td @@ -601,4 +601,4 @@ def ACC_SerialLoop : Directive<"serial loop"> { def ACC_Unknown : Directive<"unknown"> { let isDefault = 1; -} \ No newline at end of file +} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h index 1a42d189db44a..8a4179167c894 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h @@ -70,15 +70,20 @@ TraitSelector getOpenMPContextTraitSelectorForProperty(TraitProperty Property); /// Return a textual representation of the trait selector \p Kind. StringRef getOpenMPContextTraitSelectorName(TraitSelector Kind); -/// Parse \p Str and return the trait set it matches or -/// TraitProperty::invalid. -TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, StringRef Str); +/// Parse \p Str and return the trait property it matches in the set \p Set and +/// selector \p Selector or TraitProperty::invalid. +TraitProperty getOpenMPContextTraitPropertyKind(TraitSet Set, + TraitSelector Selector, + StringRef Str); /// Return the trait property for a singleton selector \p Selector. TraitProperty getOpenMPContextTraitPropertyForSelector(TraitSelector Selector); -/// Return a textual representation of the trait property \p Kind. -StringRef getOpenMPContextTraitPropertyName(TraitProperty Kind); +/// Return a textual representation of the trait property \p Kind, which might +/// be the raw string we parsed (\p RawString) if we do not translate the +/// property into a (distinct) enum. +StringRef getOpenMPContextTraitPropertyName(TraitProperty Kind, + StringRef RawString); /// Return a textual representation of the trait property \p Kind with selector /// and set name included. @@ -112,24 +117,36 @@ bool isValidTraitPropertyForTraitSetAndSelector(TraitProperty Property, /// scored (via the ScoresMap). In addition, the required construct nesting is /// described as well. struct VariantMatchInfo { - /// Add the trait \p Property to the required trait set. If \p Score is not - /// null, it recorded as well. If \p Property is in the `construct` set it - /// is recorded in-order in the ConstructTraits as well. - void addTrait(TraitProperty Property, APInt *Score = nullptr) { - addTrait(getOpenMPContextTraitSetForProperty(Property), Property, Score); + /// Add the trait \p Property to the required trait set. \p RawString is the + /// string we parsed and derived \p Property from. If \p Score is not null, it + /// recorded as well. If \p Property is in the `construct` set it is recorded + /// in-order in the ConstructTraits as well. 
+ void addTrait(TraitProperty Property, StringRef RawString, + APInt *Score = nullptr) { + addTrait(getOpenMPContextTraitSetForProperty(Property), Property, RawString, + Score); } /// Add the trait \p Property which is in set \p Set to the required trait - /// set. If \p Score is not null, it recorded as well. If \p Set is the - /// `construct` set it is recorded in-order in the ConstructTraits as well. - void addTrait(TraitSet Set, TraitProperty Property, APInt *Score = nullptr) { + /// set. \p RawString is the string we parsed and derived \p Property from. If + /// \p Score is not null, it recorded as well. If \p Set is the `construct` + /// set it is recorded in-order in the ConstructTraits as well. + void addTrait(TraitSet Set, TraitProperty Property, StringRef RawString, + APInt *Score = nullptr) { if (Score) ScoreMap[Property] = *Score; + + // Special handling for `device={isa(...)}` as we do not match the enum but + // the raw string. + if (Property == TraitProperty::device_isa___ANY) + ISATraits.push_back(RawString); + RequiredTraits.set(unsigned(Property)); if (Set == TraitSet::construct) ConstructTraits.push_back(Property); } BitVector RequiredTraits = BitVector(unsigned(TraitProperty::Last) + 1); + SmallVector ISATraits; SmallVector ConstructTraits; SmallDenseMap ScoreMap; }; @@ -139,6 +156,7 @@ struct VariantMatchInfo { /// in OpenMP constructs at the location. struct OMPContext { OMPContext(bool IsDeviceCompilation, Triple TargetTriple); + virtual ~OMPContext() = default; void addTrait(TraitProperty Property) { addTrait(getOpenMPContextTraitSetForProperty(Property), Property); @@ -149,6 +167,11 @@ struct OMPContext { ConstructTraits.push_back(Property); } + /// Hook for users to check if an ISA trait matches. The trait is described as + /// the string that got parsed and it depends on the target and context if + /// this matches or not. + virtual bool matchesISATrait(StringRef) const { return false; } + BitVector ActiveTraits = BitVector(unsigned(TraitProperty::Last) + 1); SmallVector ConstructTraits; }; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 95eed59f1b3d0..f813a730342e2 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -156,6 +156,7 @@ class OpenMPIRBuilder { /// Generator for '#omp parallel' /// /// \param Loc The insert and source location description. + /// \param AllocaIP The insertion points to be used for alloca instructions. /// \param BodyGenCB Callback that will generate the region code. /// \param PrivCB Callback to copy a given variable (think copy constructor). /// \param FiniCB Callback to finalize variable copies. @@ -166,10 +167,11 @@ class OpenMPIRBuilder { /// /// \returns The insertion position *after* the parallel. 
IRBuilder<>::InsertPoint - CreateParallel(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, - PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, - Value *IfCondition, Value *NumThreads, - omp::ProcBindKind ProcBind, bool IsCancellable); + CreateParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, + BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, + bool IsCancellable); /// Generator for '#omp flush' /// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 4f2fcb8af5d1d..3fc87dc34cd34 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -10,100 +10,12 @@ /// This file defines the list of supported OpenMP directives, clauses, runtime /// calls, and other things that need to be listed in enums. /// +/// This file is under transition to OMP.td with TableGen code generation. +/// //===----------------------------------------------------------------------===// /// OpenMP Directives and combined directives -/// -///{ - -#ifndef OMP_DIRECTIVE -#define OMP_DIRECTIVE(Enum, Str) -#endif - -#define __OMP_DIRECTIVE_EXT(Name, Str) OMP_DIRECTIVE(OMPD_##Name, Str) -#define __OMP_DIRECTIVE(Name) __OMP_DIRECTIVE_EXT(Name, #Name) - -__OMP_DIRECTIVE(threadprivate) -__OMP_DIRECTIVE(parallel) -__OMP_DIRECTIVE(task) -__OMP_DIRECTIVE(simd) -__OMP_DIRECTIVE(for) -__OMP_DIRECTIVE(sections) -__OMP_DIRECTIVE(section) -__OMP_DIRECTIVE(single) -__OMP_DIRECTIVE(master) -__OMP_DIRECTIVE(critical) -__OMP_DIRECTIVE(taskyield) -__OMP_DIRECTIVE(barrier) -__OMP_DIRECTIVE(taskwait) -__OMP_DIRECTIVE(taskgroup) -__OMP_DIRECTIVE(flush) -__OMP_DIRECTIVE(ordered) -__OMP_DIRECTIVE(atomic) -__OMP_DIRECTIVE(target) -__OMP_DIRECTIVE(teams) -__OMP_DIRECTIVE(cancel) -__OMP_DIRECTIVE(requires) -__OMP_DIRECTIVE_EXT(target_data, "target data") -__OMP_DIRECTIVE_EXT(target_enter_data, "target enter data") -__OMP_DIRECTIVE_EXT(target_exit_data, "target exit data") -__OMP_DIRECTIVE_EXT(target_parallel, "target parallel") -__OMP_DIRECTIVE_EXT(target_parallel_for, "target parallel for") -__OMP_DIRECTIVE_EXT(target_update, "target update") -__OMP_DIRECTIVE_EXT(parallel_for, "parallel for") -__OMP_DIRECTIVE_EXT(parallel_for_simd, "parallel for simd") -__OMP_DIRECTIVE_EXT(parallel_master, "parallel master") -__OMP_DIRECTIVE_EXT(parallel_sections, "parallel sections") -__OMP_DIRECTIVE_EXT(for_simd, "for simd") -__OMP_DIRECTIVE_EXT(cancellation_point, "cancellation point") -__OMP_DIRECTIVE_EXT(declare_reduction, "declare reduction") -__OMP_DIRECTIVE_EXT(declare_mapper, "declare mapper") -__OMP_DIRECTIVE_EXT(declare_simd, "declare simd") -__OMP_DIRECTIVE(taskloop) -__OMP_DIRECTIVE_EXT(taskloop_simd, "taskloop simd") -__OMP_DIRECTIVE(distribute) -__OMP_DIRECTIVE_EXT(declare_target, "declare target") -__OMP_DIRECTIVE_EXT(end_declare_target, "end declare target") -__OMP_DIRECTIVE_EXT(distribute_parallel_for, "distribute parallel for") -__OMP_DIRECTIVE_EXT(distribute_parallel_for_simd, - "distribute parallel for simd") -__OMP_DIRECTIVE_EXT(distribute_simd, "distribute simd") -__OMP_DIRECTIVE_EXT(target_parallel_for_simd, "target parallel for simd") -__OMP_DIRECTIVE_EXT(target_simd, "target simd") -__OMP_DIRECTIVE_EXT(teams_distribute, "teams distribute") -__OMP_DIRECTIVE_EXT(teams_distribute_simd, "teams distribute simd") -__OMP_DIRECTIVE_EXT(teams_distribute_parallel_for_simd, - "teams distribute 
parallel for simd") -__OMP_DIRECTIVE_EXT(teams_distribute_parallel_for, - "teams distribute parallel for") -__OMP_DIRECTIVE_EXT(target_teams, "target teams") -__OMP_DIRECTIVE_EXT(target_teams_distribute, "target teams distribute") -__OMP_DIRECTIVE_EXT(target_teams_distribute_parallel_for, - "target teams distribute parallel for") -__OMP_DIRECTIVE_EXT(target_teams_distribute_parallel_for_simd, - "target teams distribute parallel for simd") -__OMP_DIRECTIVE_EXT(target_teams_distribute_simd, - "target teams distribute simd") -__OMP_DIRECTIVE(allocate) -__OMP_DIRECTIVE_EXT(declare_variant, "declare variant") -__OMP_DIRECTIVE_EXT(master_taskloop, "master taskloop") -__OMP_DIRECTIVE_EXT(parallel_master_taskloop, "parallel master taskloop") -__OMP_DIRECTIVE_EXT(master_taskloop_simd, "master taskloop simd") -__OMP_DIRECTIVE_EXT(parallel_master_taskloop_simd, - "parallel master taskloop simd") -__OMP_DIRECTIVE(depobj) -__OMP_DIRECTIVE(scan) -__OMP_DIRECTIVE_EXT(begin_declare_variant, "begin declare variant") -__OMP_DIRECTIVE_EXT(end_declare_variant, "end declare variant") - -// Has to be the last because Clang implicitly expects it to be. -__OMP_DIRECTIVE(unknown) - -#undef __OMP_DIRECTIVE_EXT -#undef __OMP_DIRECTIVE -#undef OMP_DIRECTIVE - -///} +/// - Moved to OMP.td /// OpenMP Clauses /// @@ -1159,7 +1071,11 @@ __OMP_TRAIT_PROPERTY(device, kind, any) __OMP_TRAIT_SELECTOR(device, isa, true) -// TODO: What do we want for ISA? +// We use "__ANY" as a placeholder in the isa property to denote the +// conceptual "any", not the literal `any` used in kind. The string we +// use is not important except that it will show up in diagnostics. +OMP_TRAIT_PROPERTY(device_isa___ANY, device, device_isa, + "") __OMP_TRAIT_SELECTOR(device, arch, true) diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h index 8ecb9aa0ce020..318532b24e838 100644 --- a/llvm/include/llvm/IR/ConstantRange.h +++ b/llvm/include/llvm/IR/ConstantRange.h @@ -150,6 +150,14 @@ class LLVM_NODISCARD ConstantRange { const APInt &Other, unsigned NoWrapKind); + /// Returns true if ConstantRange calculations are supported for intrinsic + /// with \p IntrinsicID. + static bool isIntrinsicSupported(Intrinsic::ID IntrinsicID); + + /// Compute range of intrinsic result for the given operand ranges. + static ConstantRange intrinsic(Intrinsic::ID IntrinsicID, + ArrayRef Ops); + /// Set up \p Pred and \p RHS such that /// ConstantRange::makeExactICmpRegion(Pred, RHS) == *this. Return true if /// successful. @@ -456,8 +464,9 @@ class LLVM_NODISCARD ConstantRange { ConstantRange inverse() const; /// Calculate absolute value range. If the original range contains signed - /// min, then the resulting range will also contain signed min. - ConstantRange abs() const; + /// min, then the resulting range will contain signed min if and only if + /// \p IntMinIsPoison is false. + ConstantRange abs(bool IntMinIsPoison = false) const; /// Represents whether an operation on the given constant range is known to /// always or never overflow. 
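To make the semantics of the new ConstantRange::abs overload above concrete, here is a minimal usage sketch. It is illustrative only and not part of this patch: the function name absRangeSketch is invented, and the sketch assumes only the APIs shown in the hunk above.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using namespace llvm;

void absRangeSketch() {
  // The 8-bit signed range [-4, 3), i.e. the set {-4, ..., 2}.
  ConstantRange CR(APInt(8, -4, /*isSigned=*/true),
                   APInt(8, 3, /*isSigned=*/true));
  // Default behaviour (IntMinIsPoison = false) matches the old abs():
  // the absolute values of {-4, ..., 2} give the range [0, 5).
  ConstantRange Plain = CR.abs();
  // With IntMinIsPoison = true, a range containing the signed minimum
  // excludes it from the result, mirroring the poison flag on llvm.abs.
  ConstantRange Exact = CR.abs(/*IntMinIsPoison=*/true);
  (void)Plain;
  (void)Exact;
}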
diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index 71595cb15df48..7b34309cb1319 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -44,6 +44,9 @@ using BBPostDomTree = PostDomTreeBase; using BBUpdates = ArrayRef>; +using BBDomTreeGraphDiff = GraphDiff; +using BBPostDomTreeGraphDiff = GraphDiff; + extern template void Calculate(BBDomTree &DT); extern template void CalculateWithUpdates(BBDomTree &DT, BBUpdates U); @@ -62,8 +65,10 @@ extern template void DeleteEdge(BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); -extern template void ApplyUpdates(BBDomTree &DT, BBUpdates); -extern template void ApplyUpdates(BBPostDomTree &DT, BBUpdates); +extern template void ApplyUpdates(BBDomTree &DT, + BBDomTreeGraphDiff &); +extern template void ApplyUpdates(BBPostDomTree &DT, + BBPostDomTreeGraphDiff &); extern template bool Verify(const BBDomTree &DT, BBDomTree::VerificationLevel VL); diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index ffec4ff64ca66..aa0286ab11f94 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -294,8 +294,7 @@ class IRBuilderBase { } void setConstrainedFPCallAttr(CallInst *I) { - if (!I->hasFnAttr(Attribute::StrictFP)) - I->addAttribute(AttributeList::FunctionIndex, Attribute::StrictFP); + I->addAttribute(AttributeList::FunctionIndex, Attribute::StrictFP); } void setDefaultOperandBundles(ArrayRef OpBundles) { @@ -2484,6 +2483,10 @@ class IRBuilderBase { /// NumElts elements. Value *CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name = ""); + /// Return a vector value that contains \arg V broadcasted to \p + /// EC elements. + Value *CreateVectorSplat(ElementCount EC, Value *V, const Twine &Name = ""); + /// Return a value that has been extracted from a larger integer type. Value *CreateExtractInteger(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, diff --git a/llvm/include/llvm/IR/IRPrintingPasses.h b/llvm/include/llvm/IR/IRPrintingPasses.h index 3a1c489ee09f2..ed7082a31a423 100644 --- a/llvm/include/llvm/IR/IRPrintingPasses.h +++ b/llvm/include/llvm/IR/IRPrintingPasses.h @@ -18,11 +18,12 @@ #ifndef LLVM_IR_IRPRINTINGPASSES_H #define LLVM_IR_IRPRINTINGPASSES_H -#include "llvm/ADT/StringRef.h" #include "llvm/IR/PassManager.h" #include namespace llvm { +class raw_ostream; +class StringRef; /// Create and return a pass that writes the module to the specified /// \c raw_ostream. diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 0afc585dfbe5a..63194fa93cbca 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -27,6 +27,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -2743,6 +2744,15 @@ class PHINode : public Instruction { /// non-undef value. bool hasConstantOrUndefValue() const; + /// If the PHI node is complete which means all of its parent's predecessors + /// have incoming value in this PHI, return true, otherwise return false. 
+ bool isComplete() const { + return llvm::all_of(predecessors(getParent()), + [this](const BasicBlock *Pred) { + return getBasicBlockIndex(Pred) >= 0; + }); + } + /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { return I->getOpcode() == Instruction::PHI; diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index 23bcf3ce1959c..853d26c67ee3d 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1109,182 +1109,6 @@ def int_ppc_vsx_xxblendvd: GCCBuiltin<"__builtin_vsx_xxblendvd">, [IntrNoMem]>; } -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsics. -// - -let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". - /// PowerPC_QPX_Intrinsic - Base class for all QPX intrinsics. - class PowerPC_QPX_Intrinsic ret_types, - list param_types, - list properties> - : GCCBuiltin, - Intrinsic; -} - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Class Definitions. -// - -/// PowerPC_QPX_FF_Intrinsic - A PowerPC intrinsic that takes one v4f64 -/// vector and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_FFFF_Intrinsic - A PowerPC intrinsic that takes three v4f64 -/// vectors and returns one. These intrinsics have no side effects. -class PowerPC_QPX_FFFF_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Load_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64. -class PowerPC_QPX_Load_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_LoadPerm_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and returns a v4f64 permutation. -class PowerPC_QPX_LoadPerm_Intrinsic - : PowerPC_QPX_Intrinsic; - -/// PowerPC_QPX_Store_Intrinsic - A PowerPC intrinsic that takes a pointer -/// and stores a v4f64. -class PowerPC_QPX_Store_Intrinsic - : PowerPC_QPX_Intrinsic; - -//===----------------------------------------------------------------------===// -// PowerPC QPX Intrinsic Definitions. - -let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". 
- // Add Instructions - def int_ppc_qpx_qvfadd : PowerPC_QPX_FFF_Intrinsic<"qvfadd">; - def int_ppc_qpx_qvfadds : PowerPC_QPX_FFF_Intrinsic<"qvfadds">; - def int_ppc_qpx_qvfsub : PowerPC_QPX_FFF_Intrinsic<"qvfsub">; - def int_ppc_qpx_qvfsubs : PowerPC_QPX_FFF_Intrinsic<"qvfsubs">; - - // Estimate Instructions - def int_ppc_qpx_qvfre : PowerPC_QPX_FF_Intrinsic<"qvfre">; - def int_ppc_qpx_qvfres : PowerPC_QPX_FF_Intrinsic<"qvfres">; - def int_ppc_qpx_qvfrsqrte : PowerPC_QPX_FF_Intrinsic<"qvfrsqrte">; - def int_ppc_qpx_qvfrsqrtes : PowerPC_QPX_FF_Intrinsic<"qvfrsqrtes">; - - // Multiply Instructions - def int_ppc_qpx_qvfmul : PowerPC_QPX_FFF_Intrinsic<"qvfmul">; - def int_ppc_qpx_qvfmuls : PowerPC_QPX_FFF_Intrinsic<"qvfmuls">; - def int_ppc_qpx_qvfxmul : PowerPC_QPX_FFF_Intrinsic<"qvfxmul">; - def int_ppc_qpx_qvfxmuls : PowerPC_QPX_FFF_Intrinsic<"qvfxmuls">; - - // Multiply-add instructions - def int_ppc_qpx_qvfmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfmadd">; - def int_ppc_qpx_qvfmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfmadds">; - def int_ppc_qpx_qvfnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadd">; - def int_ppc_qpx_qvfnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfnmadds">; - def int_ppc_qpx_qvfmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfmsub">; - def int_ppc_qpx_qvfmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfmsubs">; - def int_ppc_qpx_qvfnmsub : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsub">; - def int_ppc_qpx_qvfnmsubs : PowerPC_QPX_FFFF_Intrinsic<"qvfnmsubs">; - def int_ppc_qpx_qvfxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadd">; - def int_ppc_qpx_qvfxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxmadds">; - def int_ppc_qpx_qvfxxnpmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadd">; - def int_ppc_qpx_qvfxxnpmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxnpmadds">; - def int_ppc_qpx_qvfxxcpnmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadd">; - def int_ppc_qpx_qvfxxcpnmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxcpnmadds">; - def int_ppc_qpx_qvfxxmadd : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadd">; - def int_ppc_qpx_qvfxxmadds : PowerPC_QPX_FFFF_Intrinsic<"qvfxxmadds">; - - // Select Instruction - def int_ppc_qpx_qvfsel : PowerPC_QPX_FFFF_Intrinsic<"qvfsel">; - - // Permute Instruction - def int_ppc_qpx_qvfperm : PowerPC_QPX_FFFF_Intrinsic<"qvfperm">; - - // Convert and Round Instructions - def int_ppc_qpx_qvfctid : PowerPC_QPX_FF_Intrinsic<"qvfctid">; - def int_ppc_qpx_qvfctidu : PowerPC_QPX_FF_Intrinsic<"qvfctidu">; - def int_ppc_qpx_qvfctidz : PowerPC_QPX_FF_Intrinsic<"qvfctidz">; - def int_ppc_qpx_qvfctiduz : PowerPC_QPX_FF_Intrinsic<"qvfctiduz">; - def int_ppc_qpx_qvfctiw : PowerPC_QPX_FF_Intrinsic<"qvfctiw">; - def int_ppc_qpx_qvfctiwu : PowerPC_QPX_FF_Intrinsic<"qvfctiwu">; - def int_ppc_qpx_qvfctiwz : PowerPC_QPX_FF_Intrinsic<"qvfctiwz">; - def int_ppc_qpx_qvfctiwuz : PowerPC_QPX_FF_Intrinsic<"qvfctiwuz">; - def int_ppc_qpx_qvfcfid : PowerPC_QPX_FF_Intrinsic<"qvfcfid">; - def int_ppc_qpx_qvfcfidu : PowerPC_QPX_FF_Intrinsic<"qvfcfidu">; - def int_ppc_qpx_qvfcfids : PowerPC_QPX_FF_Intrinsic<"qvfcfids">; - def int_ppc_qpx_qvfcfidus : PowerPC_QPX_FF_Intrinsic<"qvfcfidus">; - def int_ppc_qpx_qvfrsp : PowerPC_QPX_FF_Intrinsic<"qvfrsp">; - def int_ppc_qpx_qvfriz : PowerPC_QPX_FF_Intrinsic<"qvfriz">; - def int_ppc_qpx_qvfrin : PowerPC_QPX_FF_Intrinsic<"qvfrin">; - def int_ppc_qpx_qvfrip : PowerPC_QPX_FF_Intrinsic<"qvfrip">; - def int_ppc_qpx_qvfrim : PowerPC_QPX_FF_Intrinsic<"qvfrim">; - - // Move Instructions - def int_ppc_qpx_qvfneg : PowerPC_QPX_FF_Intrinsic<"qvfneg">; - def int_ppc_qpx_qvfabs : PowerPC_QPX_FF_Intrinsic<"qvfabs">; - def 
int_ppc_qpx_qvfnabs : PowerPC_QPX_FF_Intrinsic<"qvfnabs">; - def int_ppc_qpx_qvfcpsgn : PowerPC_QPX_FFF_Intrinsic<"qvfcpsgn">; - - // Compare Instructions - def int_ppc_qpx_qvftstnan : PowerPC_QPX_FFF_Intrinsic<"qvftstnan">; - def int_ppc_qpx_qvfcmplt : PowerPC_QPX_FFF_Intrinsic<"qvfcmplt">; - def int_ppc_qpx_qvfcmpgt : PowerPC_QPX_FFF_Intrinsic<"qvfcmpgt">; - def int_ppc_qpx_qvfcmpeq : PowerPC_QPX_FFF_Intrinsic<"qvfcmpeq">; - - // Load instructions - def int_ppc_qpx_qvlfd : PowerPC_QPX_Load_Intrinsic<"qvlfd">; - def int_ppc_qpx_qvlfda : PowerPC_QPX_Load_Intrinsic<"qvlfda">; - def int_ppc_qpx_qvlfs : PowerPC_QPX_Load_Intrinsic<"qvlfs">; - def int_ppc_qpx_qvlfsa : PowerPC_QPX_Load_Intrinsic<"qvlfsa">; - - def int_ppc_qpx_qvlfcda : PowerPC_QPX_Load_Intrinsic<"qvlfcda">; - def int_ppc_qpx_qvlfcd : PowerPC_QPX_Load_Intrinsic<"qvlfcd">; - def int_ppc_qpx_qvlfcsa : PowerPC_QPX_Load_Intrinsic<"qvlfcsa">; - def int_ppc_qpx_qvlfcs : PowerPC_QPX_Load_Intrinsic<"qvlfcs">; - def int_ppc_qpx_qvlfiwaa : PowerPC_QPX_Load_Intrinsic<"qvlfiwaa">; - def int_ppc_qpx_qvlfiwa : PowerPC_QPX_Load_Intrinsic<"qvlfiwa">; - def int_ppc_qpx_qvlfiwza : PowerPC_QPX_Load_Intrinsic<"qvlfiwza">; - def int_ppc_qpx_qvlfiwz : PowerPC_QPX_Load_Intrinsic<"qvlfiwz">; - - def int_ppc_qpx_qvlpcld : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcld">; - def int_ppc_qpx_qvlpcls : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcls">; - def int_ppc_qpx_qvlpcrd : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrd">; - def int_ppc_qpx_qvlpcrs : PowerPC_QPX_LoadPerm_Intrinsic<"qvlpcrs">; - - // Store instructions - def int_ppc_qpx_qvstfd : PowerPC_QPX_Store_Intrinsic<"qvstfd">; - def int_ppc_qpx_qvstfda : PowerPC_QPX_Store_Intrinsic<"qvstfda">; - def int_ppc_qpx_qvstfs : PowerPC_QPX_Store_Intrinsic<"qvstfs">; - def int_ppc_qpx_qvstfsa : PowerPC_QPX_Store_Intrinsic<"qvstfsa">; - - def int_ppc_qpx_qvstfcda : PowerPC_QPX_Store_Intrinsic<"qvstfcda">; - def int_ppc_qpx_qvstfcd : PowerPC_QPX_Store_Intrinsic<"qvstfcd">; - def int_ppc_qpx_qvstfcsa : PowerPC_QPX_Store_Intrinsic<"qvstfcsa">; - def int_ppc_qpx_qvstfcs : PowerPC_QPX_Store_Intrinsic<"qvstfcs">; - def int_ppc_qpx_qvstfiwa : PowerPC_QPX_Store_Intrinsic<"qvstfiwa">; - def int_ppc_qpx_qvstfiw : PowerPC_QPX_Store_Intrinsic<"qvstfiw">; - - // Logical and permutation formation - def int_ppc_qpx_qvflogical : PowerPC_QPX_Intrinsic<"qvflogical", - [llvm_v4f64_ty], - [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_ppc_qpx_qvgpci : PowerPC_QPX_Intrinsic<"qvgpci", - [llvm_v4f64_ty], [llvm_i32_ty], [IntrNoMem]>; -} - //===----------------------------------------------------------------------===// // PowerPC HTM Intrinsic Definitions. 
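The QPX intrinsics deleted above are not replaced by new intrinsics; for the arithmetic cases, generic IR vector operations already express the same thing. As a hedged sketch of one plausible migration (the helper emitQuadFAdd is invented and not part of this patch), a frontend that previously emitted llvm.ppc.qpx.qvfadd on <4 x double> operands could emit an ordinary fadd instead:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Invented helper: a plain <4 x double> fadd stands in for the removed
// target-specific llvm.ppc.qpx.qvfadd intrinsic call. LHS and RHS are
// assumed to already have type <4 x double>.
Value *emitQuadFAdd(IRBuilder<> &B, Value *LHS, Value *RHS) {
  return B.CreateFAdd(LHS, RHS, "qvadd");
}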
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 7c9ceb148a472..627a579ae5065 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -159,22 +159,6 @@ def int_wasm_narrow_unsigned : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_low_signed : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_signed : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_low_unsigned : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_unsigned : - Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem, IntrSpeculatable]>; // TODO: Replace these intrinsics with normal ISel patterns def int_wasm_pmin : @@ -206,6 +190,20 @@ def int_wasm_nearest : [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +// TODO: Replace these intrinsic with normal ISel patterns once the +// load_zero instructions are merged to the proposal. +def int_wasm_load32_zero : + Intrinsic<[llvm_v4i32_ty], + [LLVMPointerType], + [IntrReadMem, IntrArgMemOnly], + "", [SDNPMemOperand]>; + +def int_wasm_load64_zero : + Intrinsic<[llvm_v2i64_ty], + [LLVMPointerType], + [IntrReadMem, IntrArgMemOnly], + "", [SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Thread-local storage intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h index 37390e4e682ba..4df073c91766d 100644 --- a/llvm/include/llvm/IR/PassInstrumentation.h +++ b/llvm/include/llvm/IR/PassInstrumentation.h @@ -44,10 +44,6 @@ /// of a pass. For those callbacks returning false means pass will not be /// executed. /// -/// TODO: currently there is no way for a pass to opt-out of execution control -/// (e.g. become unskippable). PassManager is the only entity that determines -/// how pass instrumentation affects pass execution. -/// //===----------------------------------------------------------------------===// #ifndef LLVM_IR_PASSINSTRUMENTATION_H @@ -71,11 +67,14 @@ class PassInstrumentationCallbacks { // to take them as constant pointers, wrapped with llvm::Any. // For the case when IRUnit has been invalidated there is a different // callback to use - AfterPassInvalidated. + // We call all BeforePassFuncs to determine if a pass should run or not. + // BeforeNonSkippedPassFuncs are called only if the pass should run. // TODO: currently AfterPassInvalidated does not accept IRUnit, since passing - // already invalidated IRUnit is unsafe. There are ways to handle invalidated IRUnits - // in a safe way, and we might pursue that as soon as there is a useful instrumentation - // that needs it. + // already invalidated IRUnit is unsafe. There are ways to handle invalidated + // IRUnits in a safe way, and we might pursue that as soon as there is a + // useful instrumentation that needs it. 
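Since the comment above distinguishes BeforePassFuncs (which vote on whether a pass runs) from BeforeNonSkippedPassFuncs (which only observe passes that will actually run), a short sketch may help. It is illustrative only: registerPassTracing and the trace message are invented, and the sketch assumes just the BeforeNonSkippedPassFunc signature and registration hook introduced in the hunk that continues below.

#include "llvm/ADT/Any.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/PassInstrumentation.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Invented example: log every pass that survives the skip checks. Unlike a
// BeforePass callback, this one cannot veto execution; it only observes.
void registerPassTracing(PassInstrumentationCallbacks &PIC) {
  PIC.registerBeforeNonSkippedPassCallback([](StringRef PassID, Any) {
    errs() << "about to run: " << PassID << "\n";
  });
}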
using BeforePassFunc = bool(StringRef, Any); + using BeforeNonSkippedPassFunc = void(StringRef, Any); using AfterPassFunc = void(StringRef, Any); using AfterPassInvalidatedFunc = void(StringRef); using BeforeAnalysisFunc = void(StringRef, Any); @@ -92,6 +91,11 @@ class PassInstrumentationCallbacks { BeforePassCallbacks.emplace_back(std::move(C)); } + template + void registerBeforeNonSkippedPassCallback(CallableT C) { + BeforeNonSkippedPassCallbacks.emplace_back(std::move(C)); + } + template void registerAfterPassCallback(CallableT C) { AfterPassCallbacks.emplace_back(std::move(C)); } @@ -115,6 +119,8 @@ class PassInstrumentationCallbacks { friend class PassInstrumentation; SmallVector, 4> BeforePassCallbacks; + SmallVector, 4> + BeforeNonSkippedPassCallbacks; SmallVector, 4> AfterPassCallbacks; SmallVector, 4> AfterPassInvalidatedCallbacks; @@ -169,6 +175,12 @@ class PassInstrumentation { for (auto &C : Callbacks->BeforePassCallbacks) ShouldRun &= C(Pass.name(), llvm::Any(&IR)); ShouldRun = ShouldRun || isRequired(Pass); + + if (ShouldRun) { + for (auto &C : Callbacks->BeforeNonSkippedPassCallbacks) + C(Pass.name(), llvm::Any(&IR)); + } + return ShouldRun; } @@ -222,6 +234,8 @@ class PassInstrumentation { } }; +bool isSpecialPass(StringRef PassID, const std::vector &Specials); + } // namespace llvm #endif diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index c2f535037a56d..f16696d7c2e38 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -38,6 +38,7 @@ #define LLVM_IR_PASSMANAGER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" @@ -510,10 +511,6 @@ class PassManager : public PassInfoMixin< if (!PI.runBeforePass(*P, IR)) continue; - if (DebugLogging) - dbgs() << "Running pass: " << P->name() << " on " << IR.getName() - << "\n"; - PreservedAnalyses PassPA; { TimeTraceScope TimeScope(P->name(), IR.getName()); @@ -651,7 +648,7 @@ template class AnalysisManager { /// when any of its embedded analysis results end up invalidated. We pass an /// \c Invalidator object as an argument to \c invalidate() in order to let /// the analysis results themselves define the dependency graph on the fly. - /// This lets us avoid building building an explicit representation of the + /// This lets us avoid building an explicit representation of the /// dependencies between analysis results. class Invalidator { public: @@ -846,7 +843,7 @@ template class AnalysisManager { return true; } - /// Invalidate a specific analysis pass for an IR module. + /// Invalidate a specific analysis pass for an IR unit. /// /// Note that the analysis result can disregard invalidation, if it determines /// it is in fact still valid. @@ -890,7 +887,7 @@ template class AnalysisManager { return RI == AnalysisResults.end() ? nullptr : &*RI->second->second; } - /// Invalidate a function pass result. + /// Invalidate a pass result for a IR unit. void invalidateImpl(AnalysisKey *ID, IRUnitT &IR) { typename AnalysisResultMapT::iterator RI = AnalysisResults.find({ID, &IR}); @@ -904,20 +901,20 @@ template class AnalysisManager { AnalysisResults.erase(RI); } - /// Map type from module analysis pass ID to pass concept pointer. + /// Map type from analysis pass ID to pass concept pointer. using AnalysisPassMapT = DenseMap>; - /// Collection of module analysis passes, indexed by ID. + /// Collection of analysis passes, indexed by ID. 
AnalysisPassMapT AnalysisPasses; - /// Map from function to a list of function analysis results. /// - /// Provides linear time removal of all analysis results for a function and + /// Map from IR unit to a list of analysis results. /// + /// Provides linear time removal of all analysis results for an IR unit and /// the ultimate storage for a particular cached analysis result. AnalysisResultListMapT AnalysisResultLists; - /// Map from an analysis ID and function to a particular cached + /// Map from an analysis ID and IR unit to a particular cached /// analysis result. AnalysisResultMapT AnalysisResults; @@ -1141,9 +1138,7 @@ class OuterAnalysisManagerProxy // analyses that all trigger invalidation on the same outer analysis, // this entire system should be changed to some other deterministic // data structure such as a `SetVector` of a pair of pointers. - auto InvalidatedIt = std::find(InvalidatedIDList.begin(), - InvalidatedIDList.end(), InvalidatedID); - if (InvalidatedIt == InvalidatedIDList.end()) + if (!llvm::is_contained(InvalidatedIDList, InvalidatedID)) InvalidatedIDList.push_back(InvalidatedID); } diff --git a/llvm/include/llvm/IR/PassManagerImpl.h b/llvm/include/llvm/IR/PassManagerImpl.h index 978655ac69c46..71a86d1efb157 100644 --- a/llvm/include/llvm/IR/PassManagerImpl.h +++ b/llvm/include/llvm/IR/PassManagerImpl.h @@ -64,9 +64,6 @@ AnalysisManager<IRUnitT, ExtraArgTs...>::getResultImpl( // run it to produce a result, which we then add to the cache. if (Inserted) { auto &P = this->lookUpPass(ID); - if (DebugLogging) - dbgs() << "Running analysis: " << P.name() << " on " << IR.getName() - << "\n"; PassInstrumentation PI; if (ID != PassInstrumentationAnalysis::ID()) { @@ -97,10 +94,6 @@ inline void AnalysisManager<IRUnitT, ExtraArgTs...>::invalidate( if (PA.allAnalysesInSetPreserved<AllAnalysesOn<IRUnitT>>()) return; - if (DebugLogging) - dbgs() << "Invalidating all non-preserved analyses for: " << IR.getName() - << "\n"; - // Track whether each analysis's result is invalidated in // IsResultInvalidated. SmallDenseMap<AnalysisKey *, bool, 8> IsResultInvalidated; diff --git a/llvm/include/llvm/IR/PassTimingInfo.h b/llvm/include/llvm/IR/PassTimingInfo.h index a42d6aa3bede6..a3608971217bd 100644 --- a/llvm/include/llvm/IR/PassTimingInfo.h +++ b/llvm/include/llvm/IR/PassTimingInfo.h @@ -100,7 +100,7 @@ class TimePassesHandler { void stopTimer(StringRef PassID); // Implementation of pass instrumentation callbacks. - bool runBeforePass(StringRef PassID); + void runBeforePass(StringRef PassID); void runAfterPass(StringRef PassID); }; diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index 04ca682746264..eec96373a1a9f 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -470,6 +470,12 @@ class Value { void dropDroppableUses(llvm::function_ref<bool(const Use *)> ShouldDrop = [](const Use *) { return true; }); + /// Remove every use of this value in \p Usr that can safely be removed. + void dropDroppableUsesIn(User &Usr); + + /// Remove the droppable use \p U. + void dropDroppableUse(Use &U); + /// Check if this value is used in the specified basic block.
bool isUsedInBasicBlock(const BasicBlock *BB) const; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index b43902ff8a551..d1e3ec2a79f97 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -122,7 +122,7 @@ void initializeDAEPass(PassRegistry&); void initializeDAHPass(PassRegistry&); void initializeDCELegacyPassPass(PassRegistry&); void initializeDSELegacyPassPass(PassRegistry&); -void initializeDataFlowSanitizerPass(PassRegistry&); +void initializeDataFlowSanitizerLegacyPassPass(PassRegistry &); void initializeDeadInstEliminationPass(PassRegistry&); void initializeDeadMachineInstructionElimPass(PassRegistry&); void initializeDebugifyMachineModulePass(PassRegistry &); @@ -179,7 +179,6 @@ void initializeGuardWideningLegacyPassPass(PassRegistry&); void initializeHardwareLoopsPass(PassRegistry&); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); -void initializeIPCPPass(PassRegistry&); void initializeIPSCCPLegacyPassPass(PassRegistry&); void initializeIRCELegacyPassPass(PassRegistry&); void initializeIRTranslatorPass(PassRegistry&); @@ -255,7 +254,7 @@ void initializeLoopUnrollPass(PassRegistry&); void initializeLoopUnswitchPass(PassRegistry&); void initializeLoopVectorizePass(PassRegistry&); void initializeLoopVersioningLICMPass(PassRegistry&); -void initializeLoopVersioningPassPass(PassRegistry&); +void initializeLoopVersioningLegacyPassPass(PassRegistry &); void initializeLowerAtomicLegacyPassPass(PassRegistry&); void initializeLowerConstantIntrinsicsPass(PassRegistry&); void initializeLowerEmuTLSPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index c50459347643d..380799a63bcea 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -118,7 +118,6 @@ namespace { (void) llvm::createGlobalsAAWrapperPass(); (void) llvm::createGuardWideningPass(); (void) llvm::createLoopGuardWideningPass(); - (void) llvm::createIPConstantPropagationPass(); (void) llvm::createIPSCCPPass(); (void) llvm::createInductiveRangeCheckEliminationPass(); (void) llvm::createIndVarSimplifyPass(); diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h index cc9f42023bc29..94ed3d27e7859 100644 --- a/llvm/include/llvm/MC/MCAsmBackend.h +++ b/llvm/include/llvm/MC/MCAsmBackend.h @@ -144,7 +144,9 @@ class MCAsmBackend { /// \param STI - The MCSubtargetInfo in effect when the instruction was /// encoded. virtual bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const = 0; + const MCSubtargetInfo &STI) const { + return false; + } /// Target specific predicate for whether a given fixup requires the /// associated instruction to be relaxed. @@ -175,6 +177,10 @@ class MCAsmBackend { /// virtual unsigned getMinimumNopSize() const { return 1; } + /// Returns the maximum size of a nop in bytes on this target. + /// + virtual unsigned getMaximumNopSize() const { return 0; } + /// Write an (optimal) nop sequence of Count bytes to the given output. If the /// target cannot generate such a sequence, it should return an error. 
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index 45be9bb3d225b..d041b06c5568c 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -97,6 +97,7 @@ namespace llvm { SpecificBumpPtrAllocator<MCSectionMachO> MachOAllocator; SpecificBumpPtrAllocator<MCSectionWasm> WasmAllocator; SpecificBumpPtrAllocator<MCSectionXCOFF> XCOFFAllocator; + SpecificBumpPtrAllocator<MCInst> MCInstAllocator; /// Bindings of names to symbols. SymbolTable Symbols; @@ -380,6 +381,11 @@ namespace llvm { /// @} + /// \name MCInst Management + + /// Create and return a new MC instruction. + MCInst *createMCInst(); + /// \name Symbol Management /// @{ diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h index e3cea0ae64cf3..70da5f76e7665 100644 --- a/llvm/include/llvm/MC/MCDwarf.h +++ b/llvm/include/llvm/MC/MCDwarf.h @@ -467,10 +467,12 @@ class MCCFIInstruction { unsigned Register2; }; std::vector<char> Values; + std::string Comment; - MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V) + MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V, + StringRef Comment = "") : Operation(Op), Label(L), Register(R), Offset(O), - Values(V.begin(), V.end()) { + Values(V.begin(), V.end()), Comment(Comment) { assert(Op != OpRegister); } @@ -570,8 +572,9 @@ class MCCFIInstruction { /// .cfi_escape Allows the user to add arbitrary bytes to the unwind /// info. - static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) { - return MCCFIInstruction(OpEscape, L, 0, 0, Vals); + static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, + StringRef Comment = "") { + return MCCFIInstruction(OpEscape, L, 0, 0, Vals, Comment); } /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE @@ -606,6 +609,10 @@ class MCCFIInstruction { assert(Operation == OpEscape); return StringRef(&Values[0], Values.size()); } + + StringRef getComment() const { + return Comment; + } }; struct MCDwarfFrameInfo { diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index a424957bb0b22..843de5d5be92e 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -500,6 +500,7 @@ class MCBinaryExpr : public MCExpr { Mul, ///< Multiplication. NE, ///< Inequality comparison. Or, ///< Bitwise or. + OrNot, ///< Bitwise or not. Shl, ///< Shift left. AShr, ///< Arithmetic shift right. LShr, ///< Logical shift right. diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index fb7166e82c098..87338ab46cc2a 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -37,6 +37,7 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> { FT_Data, FT_CompactEncodedInst, FT_Fill, + FT_Nops, FT_Relaxable, FT_Org, FT_Dwarf, @@ -350,6 +351,31 @@ class MCFillFragment : public MCFragment { } }; +class MCNopsFragment : public MCFragment { + /// The number of bytes to insert. + int64_t Size; + /// Maximum number of bytes allowed in each NOP instruction. + int64_t ControlledNopLength; + + /// Source location of the directive that this fragment was created for.
+ SMLoc Loc; + +public: + MCNopsFragment(int64_t NumBytes, int64_t ControlledNopLength, SMLoc L, + MCSection *Sec = nullptr) + : MCFragment(FT_Nops, false, Sec), Size(NumBytes), + ControlledNopLength(ControlledNopLength), Loc(L) {} + + int64_t getNumBytes() const { return Size; } + int64_t getControlledNopLength() const { return ControlledNopLength; } + + SMLoc getLoc() const { return Loc; } + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_Nops; + } +}; + class MCOrgFragment : public MCFragment { /// Value to use for filling bytes. int8_t Value; diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index c3f3ae5de921e..a00000bc11b60 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -179,6 +179,8 @@ class MCObjectStreamer : public MCStreamer { SMLoc Loc = SMLoc()) override; void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr, SMLoc Loc = SMLoc()) override; + void emitNops(int64_t NumBytes, int64_t ControlledNopLength, + SMLoc Loc) override; void emitFileDirective(StringRef Filename) override; void emitAddrsig() override; diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 484c62538366e..63a4c1d190aca 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -777,6 +777,9 @@ class MCStreamer { virtual void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr, SMLoc Loc = SMLoc()); + virtual void emitNops(int64_t NumBytes, int64_t ControlledNopLength, + SMLoc Loc); + /// Emit NumBytes worth of zeros. /// This function properly handles data in virtual sections. void emitZeros(uint64_t NumBytes); diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index 3dab0471a2391..0080d0ba58f34 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -50,6 +50,7 @@ static inline Error createError(const Twine &Err) { enum PPCInstrMasks : uint64_t { PADDI_R12_NO_DISP = 0x0610000039800000, + PLD_R12_NO_DISP = 0x04100000E5800000, MTCTR_R12 = 0x7D8903A6, BCTR = 0x4E800420, }; diff --git a/llvm/include/llvm/Object/MachOUniversalWriter.h b/llvm/include/llvm/Object/MachOUniversalWriter.h new file mode 100644 index 0000000000000..c860495ddd6f0 --- /dev/null +++ b/llvm/include/llvm/Object/MachOUniversalWriter.h @@ -0,0 +1,84 @@ +//===- MachOUniversalWriter.h - MachO universal binary writer----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declares the Slice class and writeUniversalBinary function for writing a +// MachO universal binary file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_MACHOUNIVERSALWRITER_H +#define LLVM_OBJECT_MACHOUNIVERSALWRITER_H + +#include "llvm/Object/Archive.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/MachO.h" + +namespace llvm { +namespace object { + +class Slice { + const Binary *B; + uint32_t CPUType; + uint32_t CPUSubType; + std::string ArchName; + + // P2Alignment field stores slice alignment values from universal + // binaries. This is also needed to order the slices so the total + // file size can be calculated before creating the output buffer. 
+ uint32_t P2Alignment; + +public: + explicit Slice(const MachOObjectFile &O); + + Slice(const MachOObjectFile &O, uint32_t Align); + + static Expected<Slice> create(const Archive *A); + + void setP2Alignment(uint32_t Align) { P2Alignment = Align; } + + const Binary *getBinary() const { return B; } + + uint32_t getCPUType() const { return CPUType; } + + uint32_t getCPUSubType() const { return CPUSubType; } + + uint32_t getP2Alignment() const { return P2Alignment; } + + uint64_t getCPUID() const { + return static_cast<uint64_t>(CPUType) << 32 | CPUSubType; + } + + std::string getArchString() const { + if (!ArchName.empty()) + return ArchName; + return ("unknown(" + Twine(CPUType) + "," + + Twine(CPUSubType & ~MachO::CPU_SUBTYPE_MASK) + ")") + .str(); + } + + friend bool operator<(const Slice &Lhs, const Slice &Rhs) { + if (Lhs.CPUType == Rhs.CPUType) + return Lhs.CPUSubType < Rhs.CPUSubType; + // force arm64-family to follow after all other slices for + // compatibility with cctools lipo + if (Lhs.CPUType == MachO::CPU_TYPE_ARM64) + return false; + if (Rhs.CPUType == MachO::CPU_TYPE_ARM64) + return true; + // Sort by alignment to minimize file size + return Lhs.P2Alignment < Rhs.P2Alignment; + } +}; + +Error writeUniversalBinary(ArrayRef<Slice> Slices, StringRef OutputFileName); + +} // end namespace object + +} // end namespace llvm + +#endif // LLVM_OBJECT_MACHOUNIVERSALWRITER_H diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h index 5837c69ed59f9..c7c3070651504 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h +++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h @@ -33,14 +33,19 @@ Error emitDebugStr(raw_ostream &OS, const Data &DI); Error emitDebugAranges(raw_ostream &OS, const Data &DI); Error emitDebugRanges(raw_ostream &OS, const Data &DI); -Error emitPubSection(raw_ostream &OS, const PubSection &Sect, - bool IsLittleEndian, bool IsGNUPubSec = false); +Error emitDebugPubnames(raw_ostream &OS, const Data &DI); +Error emitDebugPubtypes(raw_ostream &OS, const Data &DI); +Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI); +Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI); Error emitDebugInfo(raw_ostream &OS, const Data &DI); Error emitDebugLine(raw_ostream &OS, const Data &DI); Error emitDebugAddr(raw_ostream &OS, const Data &DI); Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI); Error emitDebugRnglists(raw_ostream &OS, const Data &DI); +Error emitDebugLoclists(raw_ostream &OS, const Data &DI); +std::function<Error(raw_ostream &, const Data &)> +getDWARFEmitterByName(StringRef SecName); Expected<StringMap<std::unique_ptr<MemoryBuffer>>> emitDebugSections(StringRef YAMLString, bool IsLittleEndian = sys::IsLittleEndianHost); diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 7b18ceebdb1cf..ae3eff1fe8564 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -60,16 +61,16 @@ struct Abbrev { struct ARangeDescriptor { llvm::yaml::Hex64 Address; - uint64_t Length; + yaml::Hex64 Length; }; struct ARange { dwarf::DwarfFormat Format; - uint64_t Length; + Optional<yaml::Hex64> Length; uint16_t Version; - uint32_t CuOffset; - uint8_t AddrSize; - uint8_t SegSize; + yaml::Hex64 CuOffset; + Optional<yaml::Hex8> AddrSize; + yaml::Hex8 SegSize; std::vector<ARangeDescriptor> Descriptors; }; @@ -183,13 +184,26 @@ struct StringOffsetsTable { std::vector<yaml::Hex64>
Offsets; }; +struct DWARFOperation { + dwarf::LocationAtom Operator; + std::vector Values; +}; + struct RnglistEntry { dwarf::RnglistEntries Operator; std::vector Values; }; +struct LoclistEntry { + dwarf::LoclistEntries Operator; + std::vector Values; + Optional DescriptionsLength; + std::vector Descriptions; +}; + template struct ListEntries { - std::vector Entries; + Optional> Entries; + Optional Content; }; template struct ListTable { @@ -209,7 +223,7 @@ struct Data { std::vector AbbrevDecls; std::vector DebugStrings; Optional> DebugStrOffsets; - std::vector ARanges; + Optional> DebugAranges; std::vector DebugRanges; std::vector DebugAddr; Optional PubNames; @@ -222,10 +236,11 @@ struct Data { std::vector DebugLines; Optional>> DebugRnglists; + Optional>> DebugLoclists; bool isEmpty() const; - SetVector getUsedSectionNames() const; + SetVector getNonEmptySectionNames() const; }; } // end namespace DWARFYAML @@ -252,6 +267,12 @@ LLVM_YAML_IS_SEQUENCE_VECTOR( LLVM_YAML_IS_SEQUENCE_VECTOR( llvm::DWARFYAML::ListEntries) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::RnglistEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR( + llvm::DWARFYAML::ListTable) +LLVM_YAML_IS_SEQUENCE_VECTOR( + llvm::DWARFYAML::ListEntries) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::LoclistEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::DWARFYAML::DWARFOperation) namespace llvm { namespace yaml { @@ -320,6 +341,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair); }; +template <> struct MappingTraits { + static void mapping(IO &IO, DWARFYAML::DWARFOperation &DWARFOperation); +}; + template struct MappingTraits> { static void mapping(IO &IO, DWARFYAML::ListTable &ListTable); @@ -328,12 +353,18 @@ struct MappingTraits> { template struct MappingTraits> { static void mapping(IO &IO, DWARFYAML::ListEntries &ListEntries); + static StringRef validate(IO &IO, + DWARFYAML::ListEntries &ListEntries); }; template <> struct MappingTraits { static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry); }; +template <> struct MappingTraits { + static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry); +}; + template <> struct MappingTraits { static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable); }; @@ -430,6 +461,25 @@ template <> struct ScalarEnumerationTraits { } }; +#define HANDLE_DW_LLE(unused, name) \ + io.enumCase(value, "DW_LLE_" #name, dwarf::DW_LLE_##name); + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &io, dwarf::LoclistEntries &value) { +#include "llvm/BinaryFormat/Dwarf.def" + } +}; + +#define HANDLE_DW_OP(id, name, version, vendor) \ + io.enumCase(value, "DW_OP_" #name, dwarf::DW_OP_##name); + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &io, dwarf::LocationAtom &value) { +#include "llvm/BinaryFormat/Dwarf.def" + io.enumFallback(value); + } +}; + } // end namespace yaml } // end namespace llvm diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index b1ffb20681ea8..9a5c06fdfcd5c 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -206,6 +206,12 @@ struct Section : public Chunk { // This can be used to override the sh_flags field. Optional ShFlags; + + // This can be used to override the sh_type field. It is useful when we + // want to use specific YAML keys for a section of a particular type to + // describe the content, but still want to have a different final type + // for the section. 
+ Optional ShType; }; // Fill is a block of data which is placed outside of sections. It is diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index bd8b886a2bb3c..fc0f21e404010 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -39,7 +39,7 @@ class PrintIRInstrumentation { void registerCallbacks(PassInstrumentationCallbacks &PIC); private: - bool printBeforePass(StringRef PassID, Any IR); + void printBeforePass(StringRef PassID, Any IR); void printAfterPass(StringRef PassID, Any IR); void printAfterPassInvalidated(StringRef PassID); @@ -61,7 +61,16 @@ class OptNoneInstrumentation { private: bool skip(StringRef PassID, Any IR); + bool DebugLogging; +}; + +// Debug logging for transformation and analysis passes. +class PrintPassInstrumentation { +public: + PrintPassInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {} + void registerCallbacks(PassInstrumentationCallbacks &PIC); +private: bool DebugLogging; }; @@ -69,12 +78,13 @@ class OptNoneInstrumentation { /// instrumentations and manages their state (if any). class StandardInstrumentations { PrintIRInstrumentation PrintIR; + PrintPassInstrumentation PrintPass; TimePassesHandler TimePasses; OptNoneInstrumentation OptNone; public: StandardInstrumentations(bool DebugLogging) - : PrintIR(), TimePasses(), OptNone(DebugLogging) {} + : PrintPass(DebugLogging), OptNone(DebugLogging) {} void registerCallbacks(PassInstrumentationCallbacks &PIC); diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a3359ca901339..c767a362d6dcb 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -75,10 +75,18 @@ inline StringRef getInstrProfValueProfFuncName() { } /// Return the name profile runtime entry point to do value range profiling. +// FIXME: This is to be removed after switching to the new memop value +// profiling. inline StringRef getInstrProfValueRangeProfFuncName() { return INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR; } +/// Return the name profile runtime entry point to do memop size value +/// profiling. +inline StringRef getInstrProfValueProfMemOpFuncName() { + return INSTR_PROF_VALUE_PROF_MEMOP_FUNC_STR; +} + /// Return the name prefix of variables containing instrumented function names. inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; } @@ -678,8 +686,8 @@ struct InstrProfValueSiteRecord { /// Optionally scale merged counts by \p Weight. void merge(InstrProfValueSiteRecord &Input, uint64_t Weight, function_ref Warn); - /// Scale up value profile data counts. - void scale(uint64_t Weight, function_ref Warn); + /// Scale up value profile data counts by N (Numerator) / D (Denominator). + void scale(uint64_t N, uint64_t D, function_ref Warn); /// Compute the overlap b/w this record and Input record. void overlap(InstrProfValueSiteRecord &Input, uint32_t ValueKind, @@ -753,8 +761,8 @@ struct InstrProfRecord { function_ref Warn); /// Scale up profile counts (including value profile data) by - /// \p Weight. - void scale(uint64_t Weight, function_ref Warn); + /// a factor of (N / D). + void scale(uint64_t N, uint64_t D, function_ref Warn); /// Sort value profile data (per site) by count. void sortValueData() { @@ -839,8 +847,8 @@ struct InstrProfRecord { uint64_t Weight, function_ref Warn); - // Scale up value profile data count. 
- void scaleValueProfData(uint32_t ValueKind, uint64_t Weight, + // Scale up value profile data count by N (Numerator) / D (Denominator). + void scaleValueProfData(uint32_t ValueKind, uint64_t N, uint64_t D, function_ref Warn); }; diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index e56623afad644..06927fb5652b9 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -157,6 +157,8 @@ VALUE_PROF_FUNC_PARAM(void *, Data, Type::getInt8PtrTy(Ctx)) INSTR_PROF_COMMA #ifndef VALUE_RANGE_PROF VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) #else /* VALUE_RANGE_PROF */ +/* FIXME: This is to be removed after switching to the new memop value + * profiling. */ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) \ INSTR_PROF_COMMA VALUE_PROF_FUNC_PARAM(uint64_t, PreciseRangeStart, Type::getInt64Ty(Ctx)) \ @@ -754,9 +756,14 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_VALUE_PROF_FUNC __llvm_profile_instrument_target #define INSTR_PROF_VALUE_PROF_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_FUNC) +/* FIXME: This is to be removed after switching to the new memop value + * profiling. */ #define INSTR_PROF_VALUE_RANGE_PROF_FUNC __llvm_profile_instrument_range #define INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR \ INSTR_PROF_QUOTE(INSTR_PROF_VALUE_RANGE_PROF_FUNC) +#define INSTR_PROF_VALUE_PROF_MEMOP_FUNC __llvm_profile_instrument_memop +#define INSTR_PROF_VALUE_PROF_MEMOP_FUNC_STR \ + INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_MEMOP_FUNC) /* InstrProfile per-function control data alignment. */ #define INSTR_PROF_DATA_ALIGNMENT 8 @@ -784,3 +791,121 @@ typedef struct InstrProfValueData { #endif #undef COVMAP_V2_OR_V3 + +#ifdef INSTR_PROF_VALUE_PROF_MEMOP_API + +#ifdef __cplusplus +#define INSTR_PROF_INLINE inline +#else +#define INSTR_PROF_INLINE +#endif + +/* The value range buckets (22 buckets) for the memop size value profiling looks + * like: + * + * [0, 0] + * [1, 1] + * [2, 2] + * [3, 3] + * [4, 4] + * [5, 5] + * [6, 6] + * [7, 7] + * [8, 8] + * [9, 15] + * [16, 16] + * [17, 31] + * [32, 32] + * [33, 63] + * [64, 64] + * [65, 127] + * [128, 128] + * [129, 255] + * [256, 256] + * [257, 511] + * [512, 512] + * [513, UINT64_MAX] + * + * Each range has a 'representative value' which is the lower end value of the + * range and used to store in the runtime profile data records and the VP + * metadata. For example, it's 2 for [2, 2] and 64 for [65, 127]. + */ + +/* + * Clz and Popcount. This code was copied from + * compiler-rt/lib/fuzzer/{FuzzerBuiltins.h,FuzzerBuiltinsMsvc.h} and + * llvm/include/llvm/Support/MathExtras.h. + */ +#if defined(_MSC_VER) && !defined(__clang__) + +#include +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfClzll(unsigned long long X) { + unsigned long LeadZeroIdx = 0; +#if !defined(_M_ARM64) && !defined(_M_X64) + // Scan the high 32 bits. + if (_BitScanReverse(&LeadZeroIdx, (unsigned long)(X >> 32))) + return (int)(63 - (LeadZeroIdx + 32)); // Create a bit offset + // from the MSB. + // Scan the low 32 bits. + if (_BitScanReverse(&LeadZeroIdx, (unsigned long)(X))) + return (int)(63 - LeadZeroIdx); +#else + if (_BitScanReverse64(&LeadZeroIdx, X)) return 63 - LeadZeroIdx; +#endif + return 64; +} +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfPopcountll(unsigned long long X) { + // This code originates from https://reviews.llvm.org/rG30626254510f. 
+ unsigned long long v = X; + v = v - ((v >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (int)((unsigned long long)(v * 0x0101010101010101ULL) >> 56); +} + +#else + +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfClzll(unsigned long long X) { return __builtin_clzll(X); } +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE +int InstProfPopcountll(unsigned long long X) { return __builtin_popcountll(X); } + +#endif /* defined(_MSC_VER) && !defined(__clang__) */ + +/* Map an (observed) memop size value to the representative value of its range. + * For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */ +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE uint64_t +InstrProfGetRangeRepValue(uint64_t Value) { + if (Value <= 8) + // The first ranges are individually tracked. Use the value as is. + return Value; + else if (Value >= 513) + // The last range is mapped to its lowest value. + return 513; + else if (InstProfPopcountll(Value) == 1) + // If it's a power of two, use it as is. + return Value; + else + // Otherwise, map it to the previous power of two plus 1. + return (1 << (64 - InstProfClzll(Value) - 1)) + 1; +} + +/* Return true if the range that an (observed) memop size value belongs to has + * only a single value in the range. For example, 0 -> true, 8 -> true, 10 -> + * false, 64 -> true, 100 -> false, 513 -> false. */ +INSTR_PROF_VISIBILITY INSTR_PROF_INLINE unsigned +InstrProfIsSingleValRange(uint64_t Value) { + if (Value <= 8) + // The first ranges are individually tracked. + return 1; + else if (InstProfPopcountll(Value) == 1) + // If it's a power of two, there's only one value. + return 1; + else + // Otherwise, there's more than one value in the range. + return 0; +} + +#endif /* INSTR_PROF_VALUE_PROF_MEMOP_API */ diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index 2d69bba26a29c..35c2669d55a69 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -48,6 +48,8 @@ class InstrProfWriter { InstrProfWriter(bool Sparse = false, bool InstrEntryBBEnabled = false); ~InstrProfWriter(); + StringMap<ProfilingData> &getProfileData() { return FunctionData; } + /// Add function counts for the given function. If there are already counts /// for this function and the hash and number of counts match, each counter is /// summed. Optionally scale counts by \p Weight.
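Editor's note: to make the memop-size bucketing above concrete, here is a hedged sketch exercising the new helpers. The expected values come straight from the comments in InstrProfData.inc; the two leading #defines mimic how a client such as compiler-rt is expected to opt into this section of the .inc file, and are an assumption of this sketch.

// Sketch: memop size values collapse onto their range representatives.
#include <cstdint>
#define INSTR_PROF_VISIBILITY
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
#include <cassert>

int main() {
  assert(InstrProfGetRangeRepValue(5) == 5);      // small sizes kept exactly
  assert(InstrProfGetRangeRepValue(22) == 17);    // 16 + 1
  assert(InstrProfGetRangeRepValue(99) == 65);    // 64 + 1
  assert(InstrProfGetRangeRepValue(1001) == 513); // last, open-ended bucket
  assert(InstrProfIsSingleValRange(64));          // power of two: exact
  assert(!InstrProfIsSingleValRange(100));        // multi-value range
  return 0;
}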
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 562468333ef47..fa5326038ada0 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -515,6 +515,8 @@ class FunctionSamples { sampleprof_error merge(const FunctionSamples &Other, uint64_t Weight = 1) { sampleprof_error Result = sampleprof_error::success; Name = Other.getName(); + if (!GUIDToFuncNameMap) + GUIDToFuncNameMap = Other.GUIDToFuncNameMap; MergeResult(Result, addTotalSamples(Other.getTotalSamples(), Weight)); MergeResult(Result, addHeadSamples(Other.getHeadSamples(), Weight)); for (const auto &I : Other.getBodySamples()) { diff --git a/llvm/include/llvm/Support/ARMTargetParser.h b/llvm/include/llvm/Support/ARMTargetParser.h index 4e76b3c4b83e0..7dd2abd292128 100644 --- a/llvm/include/llvm/Support/ARMTargetParser.h +++ b/llvm/include/llvm/Support/ARMTargetParser.h @@ -250,7 +250,8 @@ StringRef getSubArch(ArchKind AK); StringRef getArchExtName(uint64_t ArchExtKind); StringRef getArchExtFeature(StringRef ArchExt); bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt, - std::vector &Features); + std::vector &Features, + unsigned &ArgFPUKind); StringRef getHWDivName(uint64_t HWDivKind); // Information by Name diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 94734ce70e02c..a4a4b2ca44b1f 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -30,67 +30,43 @@ // a non-inversed graph, the children are naturally the successors when // InverseEdge is false and the predecessors when InverseEdge is true. -// We define two base clases that call into GraphDiff, one for successors -// (CFGSuccessors), where InverseEdge is false, and one for predecessors -// (CFGPredecessors), where InverseEdge is true. -// FIXME: Further refactoring may merge the two base classes into a single one -// templated / parametrized on using succ_iterator/pred_iterator and false/true -// for the InverseEdge. - -// CFGViewChildren and CFGViewPredecessors, both can be parametrized to -// consider the graph inverted or not (i.e. InverseGraph). Successors -// implicitly has InverseEdge = false and Predecessors implicitly has -// InverseEdge = true (see calls to GraphDiff methods in there). The GraphTraits -// instantiations that follow define the value of InverseGraph. - -// GraphTraits instantiations: -// - GraphDiff is equivalent to InverseGraph = false -// - GraphDiff> is equivalent to InverseGraph = true -// - second pair item is BasicBlock *, then InverseEdge = false (so it inherits -// from CFGViewChildren). -// - second pair item is Inverse, then InverseEdge = true (so it -// inherits from CFGViewPredecessors). - -// The 4 GraphTraits are as follows: -// 1. std::pair *, BasicBlock *>> : -// CFGViewChildren -// Regular CFG, children means successors, InverseGraph = false, -// InverseEdge = false. -// 2. std::pair> *, BasicBlock *>> : -// CFGViewChildren -// Reverse the graph, get successors but reverse-apply updates, -// InverseGraph = true, InverseEdge = false. -// 3. std::pair *, Inverse>> : -// CFGViewPredecessors -// Regular CFG, reverse edges, so children mean predecessors, -// InverseGraph = false, InverseEdge = true. -// 4. std::pair> *, Inverse> -// : CFGViewPredecessors -// Reverse the graph and the edges, InverseGraph = true, InverseEdge = true. 
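Editor's note: before the rewritten implementation below, a hedged sketch of how the new GraphDiff::getChildren interface is intended to be queried. The names are taken from this patch; A and B stand for arbitrary blocks of an existing function.

// Sketch: view the CFG as if the edge A->B had already been deleted.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/Support/CFGDiff.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpSnapshotSuccessors(BasicBlock *A, BasicBlock *B) {
  SmallVector<cfg::Update<BasicBlock *>, 1> Updates = {
      {cfg::UpdateKind::Delete, A, B}};
  GraphDiff<BasicBlock *> GD(Updates); // snapshot = current CFG + updates
  // B is filtered out even though the underlying CFG still has A->B.
  for (BasicBlock *Succ : GD.getChildren</*InverseEdge=*/false>(A))
    errs() << Succ->getName() << "\n";
}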
- namespace llvm { -// GraphDiff defines a CFG snapshot: given a set of Update, provide -// utilities to skip edges marked as deleted and return a set of edges marked as -// newly inserted. The current diff treats the CFG as a graph rather than a +namespace detail { +template +auto reverse_if_helper(Range &&R, std::integral_constant) { + return std::forward(R); +} + +template +auto reverse_if_helper(Range &&R, std::integral_constant) { + return llvm::reverse(std::forward(R)); +} + +template auto reverse_if(Range &&R) { + return reverse_if_helper(std::forward(R), + std::integral_constant{}); +} +} // namespace detail + +// GraphDiff defines a CFG snapshot: given a set of Update, provides +// a getChildren method to get a Node's children based on the additional updates +// in the snapshot. The current diff treats the CFG as a graph rather than a // multigraph. Added edges are pruned to be unique, and deleted edges will // remove all existing edges between two blocks. template class GraphDiff { - using UpdateMapType = SmallDenseMap>; - struct EdgesInsertedDeleted { - UpdateMapType Succ; - UpdateMapType Pred; + struct DeletesInserts { + SmallVector DI[2]; }; - // Store Deleted edges on position 0, and Inserted edges on position 1. - EdgesInsertedDeleted Edges[2]; + using UpdateMapType = SmallDenseMap; + UpdateMapType Succ; + UpdateMapType Pred; + // By default, it is assumed that, given a CFG and a set of updates, we wish // to apply these updates as given. If UpdatedAreReverseApplied is set, the // updates will be applied in reverse: deleted edges are considered re-added // and inserted edges are considered deleted when returning children. bool UpdatedAreReverseApplied; - // Using a singleton empty vector for all node requests with no - // children. - SmallVector Empty; // Keep the list of legalized updates for a deterministic order of updates // when using a GraphDiff for incremental updates in the DominatorTree. @@ -98,14 +74,19 @@ template class GraphDiff { SmallVector, 4> LegalizedUpdates; void printMap(raw_ostream &OS, const UpdateMapType &M) const { - for (auto Pair : M) - for (auto Child : Pair.second) { - OS << "("; - Pair.first->printAsOperand(OS, false); - OS << ", "; - Child->printAsOperand(OS, false); - OS << ") "; + StringRef DIText[2] = {"Delete", "Insert"}; + for (auto Pair : M) { + for (unsigned IsInsert = 0; IsInsert <= 1; ++IsInsert) { + OS << DIText[IsInsert] << " edges: \n"; + for (auto Child : Pair.second.DI[IsInsert]) { + OS << "("; + Pair.first->printAsOperand(OS, false); + OS << ", "; + Child->printAsOperand(OS, false); + OS << ") "; + } } + } OS << "\n"; } @@ -113,15 +94,12 @@ template class GraphDiff { GraphDiff() : UpdatedAreReverseApplied(false) {} GraphDiff(ArrayRef> Updates, bool ReverseApplyUpdates = false) { - cfg::LegalizeUpdates(Updates, LegalizedUpdates, InverseGraph, - /*ReverseResultOrder=*/true); - // The legalized updates are stored in reverse so we can pop_back when doing - // incremental updates. 
+ cfg::LegalizeUpdates(Updates, LegalizedUpdates, InverseGraph); for (auto U : LegalizedUpdates) { unsigned IsInsert = (U.getKind() == cfg::UpdateKind::Insert) == !ReverseApplyUpdates; - Edges[IsInsert].Succ[U.getFrom()].push_back(U.getTo()); - Edges[IsInsert].Pred[U.getTo()].push_back(U.getFrom()); + Succ[U.getFrom()].DI[IsInsert].push_back(U.getTo()); + Pred[U.getTo()].DI[IsInsert].push_back(U.getFrom()); } UpdatedAreReverseApplied = ReverseApplyUpdates; } @@ -137,55 +115,56 @@ template class GraphDiff { auto U = LegalizedUpdates.pop_back_val(); unsigned IsInsert = (U.getKind() == cfg::UpdateKind::Insert) == !UpdatedAreReverseApplied; - auto &SuccList = Edges[IsInsert].Succ[U.getFrom()]; + auto &SuccDIList = Succ[U.getFrom()]; + auto &SuccList = SuccDIList.DI[IsInsert]; assert(SuccList.back() == U.getTo()); SuccList.pop_back(); - if (SuccList.empty()) - Edges[IsInsert].Succ.erase(U.getFrom()); + if (SuccList.empty() && SuccDIList.DI[!IsInsert].empty()) + Succ.erase(U.getFrom()); - auto &PredList = Edges[IsInsert].Pred[U.getTo()]; + auto &PredDIList = Pred[U.getTo()]; + auto &PredList = PredDIList.DI[IsInsert]; assert(PredList.back() == U.getFrom()); PredList.pop_back(); - if (PredList.empty()) - Edges[IsInsert].Pred.erase(U.getTo()); + if (PredList.empty() && PredDIList.DI[!IsInsert].empty()) + Pred.erase(U.getTo()); return U; } - bool ignoreChild(const NodePtr BB, NodePtr EdgeEnd, bool InverseEdge) const { - // Used to filter nullptr in clang. - if (EdgeEnd == nullptr) - return true; - auto &DeleteChildren = - (InverseEdge != InverseGraph) ? Edges[0].Pred : Edges[0].Succ; - auto It = DeleteChildren.find(BB); - if (It == DeleteChildren.end()) - return false; - auto &EdgesForBB = It->second; - return llvm::find(EdgesForBB, EdgeEnd) != EdgesForBB.end(); - } + using VectRet = SmallVector; + template VectRet getChildren(NodePtr N) const { + using DirectedNodeT = + std::conditional_t, NodePtr>; + auto R = children(N); + VectRet Res = VectRet(detail::reverse_if(R)); + + // Remove nullptr children for clang. + llvm::erase_value(Res, nullptr); + + auto &Children = (InverseEdge != InverseGraph) ? Pred : Succ; + auto It = Children.find(N); + if (It == Children.end()) + return Res; - iterator_range::const_iterator> - getAddedChildren(const NodePtr BB, bool InverseEdge) const { - auto &InsertChildren = - (InverseEdge != InverseGraph) ? Edges[1].Pred : Edges[1].Succ; - auto It = InsertChildren.find(BB); - if (It == InsertChildren.end()) - return make_range(Empty.begin(), Empty.end()); - return make_range(It->second.begin(), It->second.end()); + // Remove children present in the CFG but not in the snapshot. + for (auto *Child : It->second.DI[0]) + llvm::erase_value(Res, Child); + + // Add children present in the snapshot for not in the real CFG. + auto &AddedChildren = It->second.DI[1]; + Res.insert(Res.end(), AddedChildren.begin(), AddedChildren.end()); + + return Res; } void print(raw_ostream &OS) const { OS << "===== GraphDiff: CFG edge changes to create a CFG snapshot. 
\n" "===== (Note: notion of children/inverse_children depends on " "the direction of edges and the graph.)\n"; - OS << "Children to insert:\n\t"; - printMap(OS, Edges[1].Succ); - OS << "Children to delete:\n\t"; - printMap(OS, Edges[0].Succ); - OS << "Inverse_children to insert:\n\t"; - printMap(OS, Edges[1].Pred); - OS << "Inverse_children to delete:\n\t"; - printMap(OS, Edges[0].Pred); + OS << "Children to delete/insert:\n\t"; + printMap(OS, Succ); + OS << "Inverse_children to delete/insert:\n\t"; + printMap(OS, Pred); OS << "\n"; } @@ -193,58 +172,6 @@ template class GraphDiff { LLVM_DUMP_METHOD void dump() const { print(dbgs()); } #endif }; - -template > -struct CFGViewChildren { - using DataRef = const GraphDiff *; - using NodeRef = std::pair; - - template - static auto makeChildRange(Range &&R, DataRef DR) { - using Iter = WrappedPairNodeDataIterator(R).begin()), NodeRef, DataRef>; - return make_range(Iter(R.begin(), DR), Iter(R.end(), DR)); - } - - static auto children(NodeRef N) { - - // filter iterator init: - auto R = make_range(GT::child_begin(N.second), GT::child_end(N.second)); - // This lambda is copied into the iterators and persists to callers, ensure - // captures are by value or otherwise have sufficient lifetime. - auto First = make_filter_range(makeChildRange(R, N.first), [N](NodeRef C) { - return !C.first->ignoreChild(N.second, C.second, InverseEdge); - }); - - // new inserts iterator init: - auto InsertVec = N.first->getAddedChildren(N.second, InverseEdge); - auto Second = makeChildRange(InsertVec, N.first); - - auto CR = concat(First, Second); - - // concat_range contains references to other ranges, returning it would - // leave those references dangling - the iterators contain - // other iterators by value so they're safe to return. - return make_range(CR.begin(), CR.end()); - } - - static auto child_begin(NodeRef N) { - return children(N).begin(); - } - - static auto child_end(NodeRef N) { - return children(N).end(); - } - - using ChildIteratorType = decltype(child_end(std::declval())); -}; - -template -struct GraphTraits *, T>> - : CFGViewChildren {}; -template -struct GraphTraits *, Inverse>> - : CFGViewChildren, B, true> {}; } // end namespace llvm #endif // LLVM_SUPPORT_CFGDIFF_H diff --git a/llvm/include/llvm/Support/CMakeLists.txt b/llvm/include/llvm/Support/CMakeLists.txt index da8a4da443edf..aa71b55721810 100644 --- a/llvm/include/llvm/Support/CMakeLists.txt +++ b/llvm/include/llvm/Support/CMakeLists.txt @@ -16,18 +16,32 @@ if(LLVM_APPEND_VC_REV) endif() endif() +set(generated_files "${version_inc}") +if (fake_version_inc) + list(APPEND generated_files "${fake_version_inc}") +endif() + # Create custom target to generate the VC revision include. 
-add_custom_command(OUTPUT "${version_inc}" "${fake_version_inc}" - DEPENDS "${llvm_vc}" "${generate_vcs_version_script}" - COMMAND ${CMAKE_COMMAND} "-DNAMES=LLVM" - "-DLLVM_SOURCE_DIR=${llvm_source_dir}" - "-DHEADER_FILE=${version_inc}" - -P "${generate_vcs_version_script}") +if (fake_version_inc) + add_custom_command(OUTPUT "${version_inc}" "${fake_version_inc}" + DEPENDS "${llvm_vc}" "${generate_vcs_version_script}" + COMMAND ${CMAKE_COMMAND} "-DNAMES=LLVM" + "-DLLVM_SOURCE_DIR=${llvm_source_dir}" + "-DHEADER_FILE=${version_inc}" + -P "${generate_vcs_version_script}") +else() + add_custom_command(OUTPUT "${version_inc}" + DEPENDS "${llvm_vc}" "${generate_vcs_version_script}" + COMMAND ${CMAKE_COMMAND} "-DNAMES=LLVM" + "-DLLVM_SOURCE_DIR=${llvm_source_dir}" + "-DHEADER_FILE=${version_inc}" + -P "${generate_vcs_version_script}") +endif() # Mark the generated header as being generated. set_source_files_properties("${version_inc}" PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) -add_custom_target(llvm_vcsrevision_h ALL DEPENDS "${version_inc}" "${fake_version_inc}") +add_custom_target(llvm_vcsrevision_h ALL DEPENDS "${generated_files}") set_target_properties(llvm_vcsrevision_h PROPERTIES FOLDER "Misc") diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 466945e40a9ce..62e44aeefe9cf 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -71,13 +71,6 @@ bool ParseCommandLineOptions(int argc, const char *const *argv, const char *EnvVar = nullptr, bool LongOptionsUseDoubleDash = false); -//===----------------------------------------------------------------------===// -// ParseEnvironmentOptions - Environment variable option processing alternate -// entry point. -// -void ParseEnvironmentOptions(const char *progName, const char *envvar, - const char *Overview = ""); - // Function pointer type for printing version information. using VersionPrinterTy = std::function; diff --git a/llvm/include/llvm/Support/FileCheck.h b/llvm/include/llvm/Support/FileCheck.h index 2f0e641394d57..1eb609ef725eb 100644 --- a/llvm/include/llvm/Support/FileCheck.h +++ b/llvm/include/llvm/Support/FileCheck.h @@ -39,10 +39,6 @@ struct FileCheckRequest { bool VerboseVerbose = false; }; -//===----------------------------------------------------------------------===// -// Summary of a FileCheck diagnostic. -//===----------------------------------------------------------------------===// - namespace Check { enum FileCheckKind { @@ -86,6 +82,7 @@ class FileCheckType { }; } // namespace Check +/// Summary of a FileCheck diagnostic. struct FileCheckDiag { /// What is the FileCheck directive for this diagnostic? Check::FileCheckType CheckTy; @@ -131,8 +128,12 @@ struct FileCheckDiag { unsigned InputStartCol; unsigned InputEndLine; unsigned InputEndCol; + /// A note to replace the one normally indicated by MatchTy, or the empty + /// string if none. 
+ std::string Note; FileCheckDiag(const SourceMgr &SM, const Check::FileCheckType &CheckTy, - SMLoc CheckLoc, MatchType MatchTy, SMRange InputRange); + SMLoc CheckLoc, MatchType MatchTy, SMRange InputRange, + StringRef Note = ""); }; class FileCheckPatternContext; diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index a29a9d787947f..5a8842ee26e21 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1131,6 +1131,43 @@ Expected<file_t> openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, SmallVectorImpl<char> *RealPath = nullptr); +/// Try to lock the file within the specified timeout. +/// +/// This function implements advisory locking on the entire file. If it returns +/// errc::success, the file is locked by the calling process. Until the +/// process unlocks the file by calling \a unlockFile, all attempts to lock the +/// same file will fail/block. The process that locked the file may assume that +/// no other process reads or writes this file, provided that all processes +/// lock the file prior to accessing its content. +/// +/// @param FD The descriptor representing the file to lock. +/// @param Timeout Time in milliseconds that the process should wait before +/// reporting lock failure. A zero value means the lock is tried +/// only once. +/// @returns errc::success if lock is successfully obtained, +/// errc::no_lock_available if the file cannot be locked, or platform-specific +/// error_code otherwise. +/// +/// @note Care should be taken when using this function in a multithreaded +/// context, as it may not prevent other threads in the same process from +/// obtaining a lock on the same file, even if they are using a different file +/// descriptor. +std::error_code +tryLockFile(int FD, + std::chrono::milliseconds Timeout = std::chrono::milliseconds(0)); + +/// Lock the file. +/// +/// This function behaves like @ref tryLockFile but waits indefinitely. +std::error_code lockFile(int FD); + +/// Unlock the file. +/// +/// @param FD The descriptor representing the file to unlock. +/// @returns errc::success if lock is successfully released or platform-specific +/// error_code otherwise. +std::error_code unlockFile(int FD); + /// @brief Close the file object. This should be used instead of ::close for /// portability. On error, the caller should assume the file is closed, as is /// the case for Process::SafelyCloseFileDescriptor @@ -1142,6 +1179,35 @@ openNativeFileForRead(const Twine &Name, OpenFlags Flags = OF_None, /// means that the filesystem may have failed to perform some buffered writes. std::error_code closeFile(file_t &F); +/// RAII class that facilitates file locking. +class FileLocker { + int FD; ///< Locked file handle. + FileLocker(int FD) : FD(FD) {} + friend class llvm::raw_fd_ostream; + +public: + FileLocker(const FileLocker &L) = delete; + FileLocker(FileLocker &&L) : FD(L.FD) { L.FD = -1; } + ~FileLocker() { + if (FD != -1) + unlockFile(FD); + } + FileLocker &operator=(FileLocker &&L) { + FD = L.FD; + L.FD = -1; + return *this; + } + FileLocker &operator=(const FileLocker &L) = delete; + std::error_code unlock() { + if (FD != -1) { + std::error_code Result = unlockFile(FD); + FD = -1; + return Result; + } + return std::error_code(); + } +}; + std::error_code getUniqueID(const Twine Path, UniqueID &Result); /// Get disk space usage information.
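Editor's note: a hedged usage sketch for the advisory-lock API above. The path and timeout are arbitrary, and closing the descriptor is deliberately elided to keep the sketch short.

// Sketch: cooperating processes serialize access to a shared file.
#include "llvm/Support/FileSystem.h"
#include <chrono>
#include <system_error>
using namespace llvm;

std::error_code touchUnderLock(const Twine &Path) {
  int FD;
  if (std::error_code EC = sys::fs::openFileForWrite(Path, FD))
    return EC;
  // Wait up to 500 ms for the lock; a zero timeout tries exactly once.
  if (std::error_code EC =
          sys::fs::tryLockFile(FD, std::chrono::milliseconds(500)))
    return EC; // NB: real code must still close FD on this path.
  // ... read or modify the file; other cooperating processes that also
  // call tryLockFile/lockFile are excluded until we unlock ...
  return sys::fs::unlockFile(FD); // FD itself still needs closing
}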
diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index 10e591a69d369..325365824b3b9 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -28,6 +28,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/CFGDiff.h" #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/raw_ostream.h" #include @@ -211,7 +212,8 @@ void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, template void ApplyUpdates(DomTreeT &DT, - ArrayRef Updates); + GraphDiff &PreViewCFG); template bool Verify(const DomTreeT &DT, typename DomTreeT::VerificationLevel VL); @@ -535,10 +537,13 @@ class DominatorTreeBase { /// The type of updates is the same for DomTreeBase and PostDomTreeBase /// with the same template parameter T. /// - /// \param Updates An unordered sequence of updates to perform. + /// \param Updates An unordered sequence of updates to perform. The current + /// CFG and the reverse of these updates provides the pre-view of the CFG. /// void applyUpdates(ArrayRef Updates) { - DomTreeBuilder::ApplyUpdates(*this, Updates); + GraphDiff PreViewCFG( + Updates, /*ReverseApplyUpdates=*/true); + DomTreeBuilder::ApplyUpdates(*this, PreViewCFG); } /// Inform the dominator tree about a CFG edge insertion and update the tree. diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h index 464de4e2b3ba1..6a9d38bceb388 100644 --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -58,6 +58,7 @@ struct SemiNCAInfo { using TreeNodePtr = DomTreeNodeBase *; using RootsT = decltype(DomTreeT::Roots); static constexpr bool IsPostDom = DomTreeT::IsPostDominator; + using GraphDiffT = GraphDiff; // Information record used by Semi-NCA during tree construction. struct InfoRec { @@ -77,21 +78,16 @@ struct SemiNCAInfo { using UpdateT = typename DomTreeT::UpdateType; using UpdateKind = typename DomTreeT::UpdateKind; struct BatchUpdateInfo { - SmallVector Updates; - using NodePtrAndKind = PointerIntPair; - - // In order to be able to walk a CFG that is out of sync with the CFG - // DominatorTree last knew about, use the list of updates to reconstruct - // previous CFG versions of the current CFG. For each node, we store a set - // of its virtually added/deleted future successors and predecessors. - // Note that these children are from the future relative to what the - // DominatorTree knows about -- using them to gets us some snapshot of the - // CFG from the past (relative to the state of the CFG). - DenseMap> FutureSuccessors; - DenseMap> FuturePredecessors; + // Note: Updates inside PreViewCFG are aleady legalized. + BatchUpdateInfo(GraphDiffT &PreViewCFG) + : PreViewCFG(PreViewCFG), + NumLegalized(PreViewCFG.getNumLegalizedUpdates()) {} + // Remembers if the whole tree was recalculated at some point during the // current batch update. bool IsRecalculated = false; + GraphDiffT &PreViewCFG; + const size_t NumLegalized; }; BatchUpdateInfo *BatchUpdates; @@ -107,66 +103,13 @@ struct SemiNCAInfo { // in progress, we need this information to continue it. 
} - template - struct ChildrenGetter { - using ResultTy = SmallVector; - - static ResultTy Get(NodePtr N, std::integral_constant) { - auto RChildren = reverse(children(N)); - return ResultTy(RChildren.begin(), RChildren.end()); - } - - static ResultTy Get(NodePtr N, std::integral_constant) { - auto IChildren = inverse_children(N); - return ResultTy(IChildren.begin(), IChildren.end()); - } - - using Tag = std::integral_constant; - - // The function below is the core part of the batch updater. It allows the - // Depth Based Search algorithm to perform incremental updates in lockstep - // with updates to the CFG. We emulated lockstep CFG updates by getting its - // next snapshots by reverse-applying future updates. - static ResultTy Get(NodePtr N, BatchUpdatePtr BUI) { - ResultTy Res = Get(N, Tag()); - // If there's no batch update in progress, simply return node's children. - if (!BUI) return Res; - - // CFG children are actually its *most current* children, and we have to - // reverse-apply the future updates to get the node's children at the - // point in time the update was performed. - auto &FutureChildren = (Inverse != IsPostDom) ? BUI->FuturePredecessors - : BUI->FutureSuccessors; - auto FCIt = FutureChildren.find(N); - if (FCIt == FutureChildren.end()) return Res; - - for (auto ChildAndKind : FCIt->second) { - const NodePtr Child = ChildAndKind.getPointer(); - const UpdateKind UK = ChildAndKind.getInt(); - - // Reverse-apply the future update. - if (UK == UpdateKind::Insert) { - // If there's an insertion in the future, it means that the edge must - // exist in the current CFG, but was not present in it before. - assert(llvm::find(Res, Child) != Res.end() - && "Expected child not found in the CFG"); - Res.erase(std::remove(Res.begin(), Res.end(), Child), Res.end()); - LLVM_DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> " - << BlockNamePrinter(Child) << "\n"); - } else { - // If there's an deletion in the future, it means that the edge cannot - // exist in the current CFG, but existed in it before. - assert(llvm::find(Res, Child) == Res.end() && - "Unexpected child found in the CFG"); - LLVM_DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N) - << " -> " << BlockNamePrinter(Child) << "\n"); - Res.push_back(Child); - } - } - - return Res; - } - }; + template + static SmallVector getChildren(NodePtr N, BatchUpdatePtr BUI) { + if (BUI) + return BUI->PreViewCFG.template getChildren(N); + GraphDiffT GD; + return GD.template getChildren(N); + } NodePtr getIDom(NodePtr BB) const { auto InfoIt = NodeToInfo.find(BB); @@ -233,8 +176,7 @@ struct SemiNCAInfo { NumToNode.push_back(BB); constexpr bool Direction = IsReverse != IsPostDom; // XOR. - for (const NodePtr Succ : - ChildrenGetter::Get(BB, BatchUpdates)) { + for (const NodePtr Succ : getChildren(BB, BatchUpdates)) { const auto SIT = NodeToInfo.find(Succ); // Don't visit nodes more than once but remember to collect // ReverseChildren. @@ -369,7 +311,7 @@ struct SemiNCAInfo { // to CFG nodes within infinite loops. static bool HasForwardSuccessors(const NodePtr N, BatchUpdatePtr BUI) { assert(N && "N must be a valid node"); - return !ChildrenGetter::Get(N, BUI).empty(); + return !getChildren(N, BUI).empty(); } static NodePtr GetEntryNode(const DomTreeT &DT) { @@ -787,8 +729,7 @@ struct SemiNCAInfo { // // Invariant: there is an optimal path from `To` to TN with the minimum // depth being CurrentLevel. 
- for (const NodePtr Succ : - ChildrenGetter::Get(TN->getBlock(), BUI)) { + for (const NodePtr Succ : getChildren(TN->getBlock(), BUI)) { const TreeNodePtr SuccTN = DT.getNode(Succ); assert(SuccTN && "Unreachable successor found at reachable insertion"); @@ -918,7 +859,7 @@ struct SemiNCAInfo { // the DomTree about it. // The check is O(N), so run it only in debug configuration. auto IsSuccessor = [BUI](const NodePtr SuccCandidate, const NodePtr Of) { - auto Successors = ChildrenGetter::Get(Of, BUI); + auto Successors = getChildren(Of, BUI); return llvm::find(Successors, SuccCandidate) != Successors.end(); }; (void)IsSuccessor; @@ -1005,15 +946,14 @@ struct SemiNCAInfo { const TreeNodePtr TN) { LLVM_DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n"); - for (const NodePtr Pred : - ChildrenGetter::Get(TN->getBlock(), BUI)) { + auto TNB = TN->getBlock(); + for (const NodePtr Pred : getChildren(TNB, BUI)) { LLVM_DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n"); if (!DT.getNode(Pred)) continue; - const NodePtr Support = - DT.findNearestCommonDominator(TN->getBlock(), Pred); + const NodePtr Support = DT.findNearestCommonDominator(TNB, Pred); LLVM_DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n"); - if (Support != TN->getBlock()) { + if (Support != TNB) { LLVM_DEBUG(dbgs() << "\t" << BlockNamePrinter(TN) << " is reachable from support " << BlockNamePrinter(Support) << "\n"); @@ -1144,53 +1084,23 @@ struct SemiNCAInfo { //===--------------------- DomTree Batch Updater --------------------------=== //~~ - static void ApplyUpdates(DomTreeT &DT, ArrayRef Updates) { - const size_t NumUpdates = Updates.size(); + static void ApplyUpdates(DomTreeT &DT, GraphDiffT &PreViewCFG) { + const size_t NumUpdates = PreViewCFG.getNumLegalizedUpdates(); if (NumUpdates == 0) return; // Take the fast path for a single update and avoid running the batch update // machinery. if (NumUpdates == 1) { - const auto &Update = Updates.front(); + UpdateT Update = PreViewCFG.popUpdateForIncrementalUpdates(); if (Update.getKind() == UpdateKind::Insert) - DT.insertEdge(Update.getFrom(), Update.getTo()); + InsertEdge(DT, /*BUI=*/nullptr, Update.getFrom(), Update.getTo()); else - DT.deleteEdge(Update.getFrom(), Update.getTo()); - + DeleteEdge(DT, /*BUI=*/nullptr, Update.getFrom(), Update.getTo()); return; } - BatchUpdateInfo BUI; - LLVM_DEBUG(dbgs() << "Legalizing " << BUI.Updates.size() << " updates\n"); - cfg::LegalizeUpdates(Updates, BUI.Updates, IsPostDom); - - const size_t NumLegalized = BUI.Updates.size(); - BUI.FutureSuccessors.reserve(NumLegalized); - BUI.FuturePredecessors.reserve(NumLegalized); - - // Use the legalized future updates to initialize future successors and - // predecessors. Note that these sets will only decrease size over time, as - // the next CFG snapshots slowly approach the actual (current) CFG. - for (UpdateT &U : BUI.Updates) { - BUI.FutureSuccessors[U.getFrom()].push_back({U.getTo(), U.getKind()}); - BUI.FuturePredecessors[U.getTo()].push_back({U.getFrom(), U.getKind()}); - } - -#if 0 - // FIXME: The LLVM_DEBUG macro only plays well with a modular - // build of LLVM when the header is marked as textual, but doing - // so causes redefinition errors. 
- LLVM_DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n"); - LLVM_DEBUG(if (NumLegalized < 32) for (const auto &U - : reverse(BUI.Updates)) { - dbgs() << "\t"; - U.dump(); - dbgs() << "\n"; - }); - LLVM_DEBUG(dbgs() << "\n"); -#endif - + BatchUpdateInfo BUI(PreViewCFG); // Recalculate the DominatorTree when the number of updates // exceeds a threshold, which usually makes direct updating slower than // recalculation. We select this threshold proportional to the @@ -1200,21 +1110,21 @@ struct SemiNCAInfo { // Make unittests of the incremental algorithm work if (DT.DomTreeNodes.size() <= 100) { - if (NumLegalized > DT.DomTreeNodes.size()) + if (BUI.NumLegalized > DT.DomTreeNodes.size()) CalculateFromScratch(DT, &BUI); - } else if (NumLegalized > DT.DomTreeNodes.size() / 40) + } else if (BUI.NumLegalized > DT.DomTreeNodes.size() / 40) CalculateFromScratch(DT, &BUI); // If the DominatorTree was recalculated at some point, stop the batch // updates. Full recalculations ignore batch updates and look at the actual // CFG. - for (size_t i = 0; i < NumLegalized && !BUI.IsRecalculated; ++i) + for (size_t i = 0; i < BUI.NumLegalized && !BUI.IsRecalculated; ++i) ApplyNextUpdate(DT, BUI); } static void ApplyNextUpdate(DomTreeT &DT, BatchUpdateInfo &BUI) { - assert(!BUI.Updates.empty() && "No updates to apply!"); - UpdateT CurrentUpdate = BUI.Updates.pop_back_val(); + // Popping the next update, will move the PreViewCFG to the next snapshot. + UpdateT CurrentUpdate = BUI.PreViewCFG.popUpdateForIncrementalUpdates(); #if 0 // FIXME: The LLVM_DEBUG macro only plays well with a modular // build of LLVM when the header is marked as textual, but doing @@ -1223,21 +1133,6 @@ struct SemiNCAInfo { LLVM_DEBUG(CurrentUpdate.dump(); dbgs() << "\n"); #endif - // Move to the next snapshot of the CFG by removing the reverse-applied - // current update. Since updates are performed in the same order they are - // legalized it's sufficient to pop the last item here. - auto &FS = BUI.FutureSuccessors[CurrentUpdate.getFrom()]; - assert(FS.back().getPointer() == CurrentUpdate.getTo() && - FS.back().getInt() == CurrentUpdate.getKind()); - FS.pop_back(); - if (FS.empty()) BUI.FutureSuccessors.erase(CurrentUpdate.getFrom()); - - auto &FP = BUI.FuturePredecessors[CurrentUpdate.getTo()]; - assert(FP.back().getPointer() == CurrentUpdate.getFrom() && - FP.back().getInt() == CurrentUpdate.getKind()); - FP.pop_back(); - if (FP.empty()) BUI.FuturePredecessors.erase(CurrentUpdate.getTo()); - if (CurrentUpdate.getKind() == UpdateKind::Insert) InsertEdge(DT, &BUI, CurrentUpdate.getFrom(), CurrentUpdate.getTo()); else @@ -1596,19 +1491,11 @@ void Calculate(DomTreeT &DT) { template void CalculateWithUpdates(DomTreeT &DT, ArrayRef Updates) { - // TODO: Move BUI creation in common method, reuse in ApplyUpdates. - typename SemiNCAInfo::BatchUpdateInfo BUI; - LLVM_DEBUG(dbgs() << "Legalizing " << BUI.Updates.size() << " updates\n"); - cfg::LegalizeUpdates(Updates, BUI.Updates, - DomTreeT::IsPostDominator); - const size_t NumLegalized = BUI.Updates.size(); - BUI.FutureSuccessors.reserve(NumLegalized); - BUI.FuturePredecessors.reserve(NumLegalized); - for (auto &U : BUI.Updates) { - BUI.FutureSuccessors[U.getFrom()].push_back({U.getTo(), U.getKind()}); - BUI.FuturePredecessors[U.getTo()].push_back({U.getFrom(), U.getKind()}); - } - + // FIXME: Updated to use the PreViewCFG and behave the same as until now. + // This behavior is however incorrect; this actually needs the PostViewCFG. 
+ GraphDiff PreViewCFG( + Updates, /*ReverseApplyUpdates=*/true); + typename SemiNCAInfo::BatchUpdateInfo BUI(PreViewCFG); SemiNCAInfo::CalculateFromScratch(DT, &BUI); } @@ -1628,8 +1515,9 @@ void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, template void ApplyUpdates(DomTreeT &DT, - ArrayRef Updates) { - SemiNCAInfo::ApplyUpdates(DT, Updates); + GraphDiff &PreViewCFG) { + SemiNCAInfo::ApplyUpdates(DT, PreViewCFG); } template diff --git a/llvm/include/llvm/Support/SymbolRemappingReader.h b/llvm/include/llvm/Support/SymbolRemappingReader.h index 2b9ab570eb8b3..820cf9e021920 100644 --- a/llvm/include/llvm/Support/SymbolRemappingReader.h +++ b/llvm/include/llvm/Support/SymbolRemappingReader.h @@ -68,7 +68,7 @@ namespace llvm { class SymbolRemappingParseError : public ErrorInfo { public: - SymbolRemappingParseError(StringRef File, int64_t Line, Twine Message) + SymbolRemappingParseError(StringRef File, int64_t Line, const Twine &Message) : File(File), Line(Line), Message(Message.str()) {} void log(llvm::raw_ostream &OS) const override { diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index f61b3f9acb3aa..572da717bc0b4 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -294,6 +294,12 @@ HANDLE_TARGET_OPCODE(G_INTRINSIC_TRUNC) /// INTRINSIC round intrinsic. HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUND) +/// INTRINSIC round to integer intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_LRINT) + +/// INTRINSIC roundeven intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN) + /// INTRINSIC readcyclecounter HANDLE_TARGET_OPCODE(G_READCYCLECOUNTER) @@ -469,6 +475,30 @@ HANDLE_TARGET_OPCODE(G_USUBSAT) /// Generic saturating signed subtraction. HANDLE_TARGET_OPCODE(G_SSUBSAT) +// Perform signed fixed point multiplication +HANDLE_TARGET_OPCODE(G_SMULFIX) + +// Perform unsigned fixed point multiplication +HANDLE_TARGET_OPCODE(G_UMULFIX) + +// Perform signed, saturating fixed point multiplication +HANDLE_TARGET_OPCODE(G_SMULFIXSAT) + +// Perform unsigned, saturating fixed point multiplication +HANDLE_TARGET_OPCODE(G_UMULFIXSAT) + +// Perform signed fixed point division +HANDLE_TARGET_OPCODE(G_SDIVFIX) + +// Perform unsigned fixed point division +HANDLE_TARGET_OPCODE(G_UDIVFIX) + +// Perform signed, saturating fixed point division +HANDLE_TARGET_OPCODE(G_SDIVFIXSAT) + +// Perform unsigned, saturating fixed point division +HANDLE_TARGET_OPCODE(G_UDIVFIXSAT) + /// Generic FP addition. HANDLE_TARGET_OPCODE(G_FADD) diff --git a/llvm/include/llvm/Support/YAMLParser.h b/llvm/include/llvm/Support/YAMLParser.h index 53009d7ff4aad..44daf7850904a 100644 --- a/llvm/include/llvm/Support/YAMLParser.h +++ b/llvm/include/llvm/Support/YAMLParser.h @@ -222,7 +222,7 @@ class ScalarNode final : public Node { /// Gets the value of this node as a StringRef. /// - /// \param Storage is used to store the content of the returned StringRef iff + /// \param Storage is used to store the content of the returned StringRef if /// it requires any modification from how it appeared in the source. /// This happens with escaped characters and multi-line literals. 
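For readers unfamiliar with the fixed-point opcodes added to TargetOpcodes.def above: both operands carry SCALE fractional bits, so the raw product carries twice as many and must be shifted right by SCALE to stay in the same format. A plain C++ illustration of G_SMULFIX-style semantics (a model, not the GlobalISel implementation):

#include <cstdint>
#include <iostream>

int64_t smulfix(int32_t LHS, int32_t RHS, unsigned Scale) {
  // Widen first so the pre-shift product does not overflow.
  return (int64_t(LHS) * int64_t(RHS)) >> Scale;
}

int main() {
  // With Scale = 16, the value 1.5 is represented as 1.5 * 2^16 = 98304.
  int32_t OneAndHalf = 98304;
  int64_t P = smulfix(OneAndHalf, OneAndHalf, 16);
  std::cout << P << '\n';             // 147456 == 2.25 * 2^16
  std::cout << (P / 65536.0) << '\n'; // 2.25
  // A scale of zero degenerates to ordinary integer multiplication.
  std::cout << smulfix(6, 7, 0) << '\n'; // 42
}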
StringRef getValue(SmallVectorImpl &Storage) const; diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 44e34a4a09b46..acb1d61cf569d 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -902,24 +902,7 @@ class IO { template void processKeyWithDefault(const char *Key, Optional &Val, const Optional &DefaultValue, bool Required, - Context &Ctx) { - assert(DefaultValue.hasValue() == false && - "Optional shouldn't have a value!"); - void *SaveInfo; - bool UseDefault = true; - const bool sameAsDefault = outputting() && !Val.hasValue(); - if (!outputting() && !Val.hasValue()) - Val = T(); - if (Val.hasValue() && - this->preflightKey(Key, Required, sameAsDefault, UseDefault, - SaveInfo)) { - yamlize(*this, Val.getValue(), Required, Ctx); - this->postflightKey(SaveInfo); - } else { - if (UseDefault) - Val = DefaultValue; - } - } + Context &Ctx); template void processKeyWithDefault(const char *Key, T &Val, const T &DefaultValue, @@ -1625,6 +1608,42 @@ class Output : public IO { StringRef PaddingBeforeContainer; }; +template +void IO::processKeyWithDefault(const char *Key, Optional &Val, + const Optional &DefaultValue, bool Required, + Context &Ctx) { + assert(DefaultValue.hasValue() == false && + "Optional shouldn't have a value!"); + void *SaveInfo; + bool UseDefault = true; + const bool sameAsDefault = outputting() && !Val.hasValue(); + if (!outputting() && !Val.hasValue()) + Val = T(); + if (Val.hasValue() && + this->preflightKey(Key, Required, sameAsDefault, UseDefault, SaveInfo)) { + + // When reading an Optional key from a YAML description, we allow the + // special "" value, which can be used to specify that no value was + // requested, i.e. the DefaultValue will be assigned. The DefaultValue is + // usually None. + bool IsNone = false; + if (!outputting()) + if (auto *Node = dyn_cast(((Input *)this)->getCurrentNode())) + // We use rtrim to ignore possible white spaces that might exist when a + // comment is present on the same line. + IsNone = Node->getRawValue().rtrim(' ') == ""; + + if (IsNone) + Val = DefaultValue; + else + yamlize(*this, Val.getValue(), Required, Ctx); + this->postflightKey(SaveInfo); + } else { + if (UseDefault) + Val = DefaultValue; + } +} + /// YAML I/O does conversion based on types. But often native data types /// are just a typedef of built in intergral types (e.g. int). But the C++ /// type matching system sees through the typedef and all the typedefed types diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index 8d289f7c765f9..e9c710d0f38fe 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include +#include #include #include #include @@ -30,12 +31,14 @@ class format_object_base; class FormattedString; class FormattedNumber; class FormattedBytes; +template class LLVM_NODISCARD Expected; namespace sys { namespace fs { enum FileAccess : unsigned; enum OpenFlags : unsigned; enum CreationDisposition : unsigned; +class FileLocker; } // end namespace fs } // end namespace sys @@ -468,7 +471,7 @@ class raw_fd_ostream : public raw_pwrite_stream { /// fsync. 
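The out-of-line processKeyWithDefault above special-cases a sentinel scalar when reading an Optional key; the value quoted in its comment is "<none>" in LLVM's YAML I/O (the angle brackets appear to have been stripped in this copy of the diff). A toy model of that behaviour using std::optional rather than YAML I/O:

#include <iostream>
#include <optional>
#include <string>

std::optional<int> readOptionalKey(const std::string &Raw,
                                   std::optional<int> Default) {
  // Trailing spaces may precede an end-of-line comment, so trim them first.
  std::string S = Raw.substr(0, Raw.find_last_not_of(' ') + 1);
  if (S == "<none>")
    return Default; // Usually std::nullopt, i.e. "no value requested".
  return std::stoi(S);
}

int main() {
  std::cout << readOptionalKey("42", std::nullopt).value_or(-1) << '\n'; // 42
  std::cout << readOptionalKey("<none>  ", std::nullopt).value_or(-1)
            << '\n'; // -1: the default was kept
}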
void close(); - bool supportsSeeking() { return SupportsSeeking; } + bool supportsSeeking() const { return SupportsSeeking; } /// Flushes the stream and repositions the underlying file descriptor position /// to the offset specified from the beginning of the file. @@ -496,6 +499,38 @@ class raw_fd_ostream : public raw_pwrite_stream { /// - from The Zen of Python, by Tim Peters /// void clear_error() { EC = std::error_code(); } + + /// Locks the underlying file. + /// + /// @returns RAII object that releases the lock upon leaving the scope, if the + /// locking was successful. Otherwise returns corresponding + /// error code. + /// + /// The function blocks the current thread until the lock become available or + /// error occurs. + /// + /// Possible use of this function may be as follows: + /// + /// @code{.cpp} + /// if (auto L = stream.lock()) { + /// // ... do action that require file to be locked. + /// } else { + /// handleAllErrors(std::move(L.takeError()), [&](ErrorInfoBase &EIB) { + /// // ... handle lock error. + /// }); + /// } + /// @endcode + LLVM_NODISCARD Expected lock(); + + /// Tries to lock the underlying file within the specified period. + /// + /// @returns RAII object that releases the lock upon leaving the scope, if the + /// locking was successful. Otherwise returns corresponding + /// error code. + /// + /// It is used as @ref lock. + LLVM_NODISCARD + Expected tryLockFor(std::chrono::milliseconds Timeout); }; /// This returns a reference to a raw_fd_ostream for standard output. Use it diff --git a/llvm/include/llvm/TableGen/Error.h b/llvm/include/llvm/TableGen/Error.h index cf990427f5770..1eed622ab393f 100644 --- a/llvm/include/llvm/TableGen/Error.h +++ b/llvm/include/llvm/TableGen/Error.h @@ -20,6 +20,8 @@ namespace llvm { void PrintNote(const Twine &Msg); void PrintNote(ArrayRef NoteLoc, const Twine &Msg); +LLVM_ATTRIBUTE_NORETURN void PrintFatalNote(ArrayRef ErrorLoc, + const Twine &Msg); void PrintWarning(ArrayRef WarningLoc, const Twine &Msg); void PrintWarning(const char *Loc, const Twine &Msg); diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 03795c0050a2c..841a314ecc90c 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -545,6 +545,77 @@ def G_SSUBSAT : GenericInstruction { let isCommutable = 0; } +/// RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point +/// multiplication on 2 integers with the same width and scale. SCALE +/// represents the scale of both operands as fixed point numbers. This +/// SCALE parameter must be a constant integer. A scale of zero is +/// effectively performing multiplication on 2 integers. +def G_SMULFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +def G_UMULFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +/// Same as the corresponding unsaturated fixed point instructions, but the +/// result is clamped between the min and max values representable by the +/// bits of the first 2 operands. 
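The tryLockFor declaration above pairs naturally with the lock() example already in the doc comment. A hedged usage sketch, assuming an already constructed raw_fd_ostream and following the same Expected/FileLocker conventions as that example:

#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <chrono>

void writeLocked(llvm::raw_fd_ostream &OS) {
  using namespace std::chrono_literals;
  // Give up if the lock cannot be acquired within half a second.
  if (auto L = OS.tryLockFor(500ms)) {
    OS << "exclusive write\n";
    // The FileLocker held by *L releases the lock when it goes out of scope.
  } else {
    llvm::handleAllErrors(L.takeError(), [](llvm::ErrorInfoBase &EIB) {
      llvm::errs() << "lock failed: " << EIB.message() << '\n';
    });
  }
}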
+def G_SMULFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +def G_UMULFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 1; +} + +/// RESULT = [US]DIVFIX(LHS, RHS, SCALE) - Perform fixed point division on +/// 2 integers with the same width and scale. SCALE represents the scale +/// of both operands as fixed point numbers. This SCALE parameter must be a +/// constant integer. +def G_SDIVFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +def G_UDIVFIX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +/// Same as the corresponding unsaturated fixed point instructions, +/// but the result is clamped between the min and max values +/// representable by the bits of the first 2 operands. +def G_SDIVFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + +def G_UDIVFIXSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, untyped_imm_0:$scale); + let hasSideEffects = 0; + let isCommutable = 0; +} + //------------------------------------------------------------------------------ // Floating Point Unary Ops. 
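The saturating variants defined above differ from G_SMULFIX/G_UMULFIX only in clamping the scaled result to the range representable in the width of the first two operands. A plain C++ model for a 16-bit signed operand width (illustrative only):

#include <algorithm>
#include <cstdint>
#include <iostream>

int16_t smulfixsat16(int16_t LHS, int16_t RHS, unsigned Scale) {
  int64_t P = (int64_t(LHS) * int64_t(RHS)) >> Scale;
  // Clamp to the 16-bit signed range instead of wrapping.
  P = std::clamp<int64_t>(P, INT16_MIN, INT16_MAX);
  return int16_t(P);
}

int main() {
  // With Scale = 8, 100.0 is 100 * 2^8 = 25600. The true product 10000.0
  // would need 10000 * 2^8, far above INT16_MAX, so the result saturates.
  std::cout << smulfixsat16(25600, 25600, 8) << '\n'; // 32767
}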
//------------------------------------------------------------------------------ @@ -841,6 +912,18 @@ def G_INTRINSIC_ROUND : GenericInstruction { let hasSideEffects = 0; } +def G_INTRINSIC_LRINT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + +def G_INTRINSIC_ROUNDEVEN : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1); + let hasSideEffects = 0; +} + def G_READCYCLECOUNTER : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 40ed6be089ac8..ba19a4635570a 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -257,6 +257,24 @@ def simplify_add_to_sub: GICombineRule < (apply [{ return Helper.applySimplifyAddToSub(*${root}, ${info});}]) >; +// Fold int2ptr(ptr2int(x)) -> x +def p2i_to_i2p_matchinfo: GIDefMatchData<"Register">; +def p2i_to_i2p: GICombineRule< + (defs root:$root, p2i_to_i2p_matchinfo:$info), + (match (wip_match_opcode G_INTTOPTR):$root, + [{ return Helper.matchCombineI2PToP2I(*${root}, ${info}); }]), + (apply [{ return Helper.applyCombineI2PToP2I(*${root}, ${info}); }]) +>; + +// Fold ptr2int(int2ptr(x)) -> x +def i2p_to_p2i_matchinfo: GIDefMatchData<"Register">; +def i2p_to_p2i: GICombineRule< + (defs root:$root, i2p_to_p2i_matchinfo:$info), + (match (wip_match_opcode G_PTRTOINT):$root, + [{ return Helper.matchCombineP2IToI2P(*${root}, ${info}); }]), + (apply [{ return Helper.applyCombineP2IToI2P(*${root}, ${info}); }]) +>; + // FIXME: These should use the custom predicate feature once it lands. def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -267,7 +285,8 @@ def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, binop_same_val, binop_left_to_zero, - binop_right_to_zero]>; + binop_right_to_zero, p2i_to_i2p, + i2p_to_p2i]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 150834e65b2dc..d3ca771ad3adb 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -71,6 +71,14 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -117,6 +125,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index f59bc5e5bae59..2a422341fdc84 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -271,6 +271,11 @@ class TargetMachine { return Options.BBSectionsFuncListBuf.get(); } + /// Returns true if a cast between SrcAS and DestAS is a noop. 
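The two rules added to Combine.td fold round-tripping cast pairs. Note that ptr2int(int2ptr(x)) --> x is only sound when the integer type preserves all pointer bits, which the real CombinerHelper verifies; the toy match function below only shows the shape of the fold over a stand-in IR, not MIR.

#include <cassert>
#include <iostream>

enum class Op { IntToPtr, PtrToInt, Other };

struct Inst {
  Op Opcode;
  Inst *Src = nullptr; // Single operand, for casts.
};

// If Root is an int2ptr fed by a ptr2int (or vice versa), return the value
// the pair round-trips; otherwise null.
Inst *matchCastPair(Inst *Root) {
  if (!Root->Src)
    return nullptr;
  Inst *Def = Root->Src;
  bool RoundTrip =
      (Root->Opcode == Op::IntToPtr && Def->Opcode == Op::PtrToInt) ||
      (Root->Opcode == Op::PtrToInt && Def->Opcode == Op::IntToPtr);
  return RoundTrip ? Def->Src : nullptr;
}

int main() {
  Inst X{Op::Other};
  Inst P2I{Op::PtrToInt, &X};
  Inst I2P{Op::IntToPtr, &P2I};
  assert(matchCastPair(&I2P) == &X); // int2ptr(ptr2int(x)) folds to x.
  std::cout << "folded\n";
}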
+ virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { + return false; + } + /// Get a \c TargetIRAnalysis appropriate for the target. /// /// This is used to construct the new pass manager's target IR analysis pass, diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h index 28e454d3b0fc7..7b73eeaf8e45b 100644 --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -155,12 +155,6 @@ Pass *createArgumentPromotionPass(unsigned maxElements = 3); /// createOpenMPOptLegacyPass - OpenMP specific optimizations. Pass *createOpenMPOptLegacyPass(); -//===----------------------------------------------------------------------===// -/// createIPConstantPropagationPass - This pass propagates constants from call -/// sites into the bodies of functions. -/// -ModulePass *createIPConstantPropagationPass(); - //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 1eead8ee788de..418a007319679 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -97,29 +97,33 @@ #ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H #define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/AbstractCallSite.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DOTGraphTraits.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" namespace llvm { +struct AADepGraphNode; +struct AADepGraph; struct Attributor; struct AbstractAttribute; struct InformationCache; @@ -144,6 +148,74 @@ enum class DepClassTy { }; ///} +/// The data structure for the nodes of a dependency graph +struct AADepGraphNode { +public: + virtual ~AADepGraphNode(){}; + using DepTy = PointerIntPair; + +protected: + /// Set of dependency graph nodes which should be updated if this one + /// is updated. The bit encodes if it is optional. 
+ TinyPtrVector Deps; + + static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } + static AbstractAttribute *DepGetValAA(DepTy &DT) { + return cast(DT.getPointer()); + } + + operator AbstractAttribute *() { return cast(this); } + +public: + using iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + using aaiterator = + mapped_iterator::iterator, decltype(&DepGetValAA)>; + + aaiterator begin() { return aaiterator(Deps.begin(), &DepGetValAA); } + aaiterator end() { return aaiterator(Deps.end(), &DepGetValAA); } + iterator child_begin() { return iterator(Deps.begin(), &DepGetVal); } + iterator child_end() { return iterator(Deps.end(), &DepGetVal); } + + virtual void print(raw_ostream &OS) const { OS << "AADepNode Impl\n"; } + TinyPtrVector &getDeps() { return Deps; } + + friend struct Attributor; + friend struct AADepGraph; +}; + +/// The data structure for the dependency graph +/// +/// Note that in this graph if there is an edge from A to B (A -> B), +/// then it means that B depends on A, and when the state of A is +/// updated, node B should also be updated +struct AADepGraph { + AADepGraph() {} + ~AADepGraph() {} + + using DepTy = AADepGraphNode::DepTy; + static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } + using iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + + /// There is no root node for the dependency graph. But the SCCIterator + /// requires a single entry point, so we maintain a fake("synthetic") root + /// node that depends on every node. + AADepGraphNode SyntheticRoot; + AADepGraphNode *GetEntryNode() { return &SyntheticRoot; } + + iterator begin() { return SyntheticRoot.child_begin(); } + iterator end() { return SyntheticRoot.child_end(); } + + void viewGraph(); + + /// Dump graph to file + void dumpGraph(); + + /// Print dependency graph + void print(); +}; + /// Helper to describe and deal with positions in the LLVM-IR. /// /// A position in the IR is described by an anchor value and an "offset" that @@ -934,8 +1006,10 @@ struct Attributor { return AA; } - AA.initialize(*this); - + { + TimeTraceScope TimeScope(AA.getName() + "::initialize"); + AA.initialize(*this); + } // We can initialize (=look at) code outside the current function set but // not call update because that would again spawn new abstract attributes in // potentially unconnected code regions (=SCCs). @@ -1020,7 +1094,9 @@ struct Attributor { assert(!AAPtr && "Attribute already in map!"); AAPtr = &AA; - AllAbstractAttributes.push_back(&AA); + DG.SyntheticRoot.Deps.push_back( + AADepGraphNode::DepTy(&AA, unsigned(DepClassTy::REQUIRED))); + return AA; } @@ -1382,12 +1458,6 @@ struct Attributor { /// See getOrCreateAAFor. bool shouldSeedAttribute(AbstractAttribute &AA); - /// The set of all abstract attributes. - ///{ - using AAVector = SmallVector; - AAVector AllAbstractAttributes; - ///} - /// A nested map to lookup abstract attributes based on the argument position /// on the outer level, and the addresses of the static member (AAType::ID) on /// the inner level. @@ -1409,6 +1479,9 @@ struct Attributor { /// Helper to update an underlying call graph. CallGraphUpdater &CGUpdater; + /// Abstract Attribute dependency graph + AADepGraph DG; + /// Set of functions for which we modified the content such that it might /// impact the call graph. 
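The SyntheticRoot comment above describes a common trick: SCC and DFS iterators want a single entry node, and a dependency graph has none, so a fake root pointing at every real node is maintained and skipped by consumers. A toy traversal showing the idea, with illustrative types:

#include <iostream>
#include <set>
#include <vector>

struct Node {
  const char *Name;
  std::vector<Node *> Children;
};

void dfs(Node *N, std::set<Node *> &Seen) {
  if (!Seen.insert(N).second)
    return;
  std::cout << N->Name << '\n';
  for (Node *C : N->Children)
    dfs(C, Seen);
}

int main() {
  Node A{"A", {}}, B{"B", {}};
  // No real node reaches all the others, so hang every node off a fake root
  // and start the traversal from its children; the root itself never prints.
  Node SyntheticRoot{"<root>", {&A, &B}};
  std::set<Node *> Seen;
  for (Node *C : SyntheticRoot.Children)
    dfs(C, Seen); // prints A, then B
}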
SmallPtrSet CGModifiedFunctions; @@ -1458,6 +1531,8 @@ struct Attributor { SmallPtrSet ToBeDeletedBlocks; SmallDenseSet ToBeDeletedInsts; ///} + + friend AADepGraph; }; /// An interface to query the internal state of an abstract attribute. @@ -2030,7 +2105,7 @@ struct IRAttribute : public BaseType { /// both directions will be added in the future. /// NOTE: The mechanics of adding a new "concrete" abstract attribute are /// described in the file comment. -struct AbstractAttribute : public IRPosition { +struct AbstractAttribute : public IRPosition, public AADepGraphNode { using StateType = AbstractState; AbstractAttribute(const IRPosition &IRP) : IRPosition(IRP) {} @@ -2038,6 +2113,14 @@ struct AbstractAttribute : public IRPosition { /// Virtual destructor. virtual ~AbstractAttribute() {} + /// This function is used to identify whether a \p DGN is of type + /// AbstractAttribute so that the dyn_cast and cast can use such information + /// to cast an AADepGraphNode to an AbstractAttribute. + /// + /// We eagerly return true here because all AADepGraphNodes except for the + /// synthetic root node are of type AbstractAttribute. + static bool classof(const AADepGraphNode *DGN) { return true; } + /// Initialize the state with the information in the Attributor \p A. /// /// This function is called by the Attributor once all abstract attributes @@ -2058,7 +2141,8 @@ struct AbstractAttribute : public IRPosition { /// Helper functions, for debug purposes only. ///{ - virtual void print(raw_ostream &OS) const; + void print(raw_ostream &OS) const override; + virtual void printWithDeps(raw_ostream &OS) const; void dump() const { print(dbgs()); } /// This function should return the "summarized" assumed state as string. @@ -2106,12 +2190,6 @@ struct AbstractAttribute : public IRPosition { /// /// \Return CHANGED if the internal state changed, otherwise UNCHANGED. virtual ChangeStatus updateImpl(Attributor &A) = 0; - -private: - /// Set of abstract attributes which were queried by this one. The bit encodes - /// if there is an optional of required dependence. - using DepTy = PointerIntPair; - TinyPtrVector Deps; }; /// Forward declarations of output streams for debug purposes. diff --git a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h index d96187b73f9bb..9b72ee0afd284 100644 --- a/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h +++ b/llvm/include/llvm/Transforms/IPO/OpenMPOpt.h @@ -33,6 +33,11 @@ struct OpenMPInModule { bool isKnown() { return Value != OpenMP::UNKNOWN; } operator bool() { return Value != OpenMP::NOT_FOUND; } + /// Does this function \p F contain any OpenMP runtime calls? + bool containsOMPRuntimeCalls(Function *F) const { + return FuncsWithOMPRuntimeCalls.contains(F); + } + /// Return the known kernels (=GPU entry points) in the module. SmallPtrSetImpl &getKernels() { return Kernels; } @@ -42,6 +47,11 @@ struct OpenMPInModule { private: enum class OpenMP { FOUND, NOT_FOUND, UNKNOWN } Value = OpenMP::UNKNOWN; + friend bool containsOpenMP(Module &M, OpenMPInModule &OMPInModule); + + /// In which functions are OpenMP runtime calls present? + SmallPtrSet FuncsWithOMPRuntimeCalls; + /// Collection of known kernels (=GPU entry points) in the module.
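The classof above is the hook for LLVM-style RTTI: cast<>/dyn_cast<> consult it instead of C++ RTTI, and here it can return true unconditionally because only the synthetic root is not an AbstractAttribute. A self-contained sketch of the pattern, using an explicit kind check for contrast:

#include <cassert>
#include <iostream>

struct GraphNode {
  enum Kind { K_Synthetic, K_Attribute };
  Kind K;
  GraphNode(Kind K) : K(K) {}
};

struct Attribute : GraphNode {
  Attribute() : GraphNode(K_Attribute) {}
  // dyn_cast-style queries dispatch through this static hook.
  static bool classof(const GraphNode *N) { return N->K == K_Attribute; }
};

// Minimal stand-in for llvm::dyn_cast.
template <typename To, typename From> To *dynCast(From *N) {
  return To::classof(N) ? static_cast<To *>(N) : nullptr;
}

int main() {
  Attribute A;
  GraphNode Root(GraphNode::K_Synthetic);
  assert(dynCast<Attribute>(&A));
  assert(!dynCast<Attribute>(&Root));
  std::cout << "classof ok\n";
}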
SmallPtrSet Kernels; }; diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index d4373d7b39eaa..0453cb428bc07 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -143,9 +143,8 @@ ModulePass *createInstrProfilingLegacyPass( ModulePass *createInstrOrderFilePass(); // Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation -ModulePass *createDataFlowSanitizerPass( - const std::vector &ABIListFiles = std::vector(), - void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); +ModulePass *createDataFlowSanitizerLegacyPassPass( + const std::vector &ABIListFiles = std::vector()); // Options for sanitizer coverage instrumentation. struct SanitizerCoverageOptions { diff --git a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h new file mode 100644 index 0000000000000..c43de78df592e --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h @@ -0,0 +1,31 @@ +//===- DataFlowSanitizer.h - dynamic data flow analysis -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DATAFLOWSANITIZER_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_DATAFLOWSANITIZER_H + +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include +#include + +namespace llvm { + +class DataFlowSanitizerPass : public PassInfoMixin { +private: + std::vector ABIListFiles; + +public: + DataFlowSanitizerPass( + const std::vector &ABIListFiles = std::vector()) + : ABIListFiles(ABIListFiles) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h index 263d3b629589c..a7052f7b6a2b1 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h +++ b/llvm/include/llvm/Transforms/Instrumentation/InstrProfiling.h @@ -68,6 +68,8 @@ class InstrProfiling : public PassInfoMixin { // vector of counter load/store pairs to be register promoted. std::vector PromotionCandidates; + // FIXME: These are to be removed after switching to the new memop value + // profiling. // The start value of precise value profile range for memory intrinsic sizes. int64_t MemOPSizeRangeStart; // The end value of precise value profile range for memory intrinsic sizes. diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 5cc8d1fa74376..546a5c1b96e90 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -305,10 +305,6 @@ bool replaceDbgDeclare(Value *Address, Value *NewAddress, DIBuilder &Builder, void replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, int Offset = 0); -/// Finds alloca where the value comes from. -AllocaInst *findAllocaForValue(Value *V, - DenseMap &AllocaForValue); - /// Assuming the instruction \p I is going to be deleted, attempt to salvage /// debug users of \p I by writing the effect of \p I in a DIExpression. 
If it /// cannot be salvaged changes its debug uses to undef. diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h new file mode 100644 index 0000000000000..8f857e1e5c215 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h @@ -0,0 +1,40 @@ +//===- llvm/Transforms/Utils/LoopPeel.h ----- Peeling utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines some loop peeling utilities. It does not define any +// actual pass or policy. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_LOOPPEEL_H +#define LLVM_TRANSFORMS_UTILS_LOOPPEEL_H + +#include "llvm/Analysis/TargetTransformInfo.h" + +namespace llvm { + +bool canPeel(Loop *L); + +bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA); + +TargetTransformInfo::PeelingPreferences +gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, + const TargetTransformInfo &TTI, + Optional UserAllowPeeling, + Optional UserAllowProfileBasedPeeling, + bool UnrollingSpecficValues = false); + +void computePeelCount(Loop *L, unsigned LoopSize, + TargetTransformInfo::PeelingPreferences &PP, + unsigned &TripCount, ScalarEvolution &SE, + unsigned Threshold = UINT_MAX); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_LOOPPEEL_H diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 60446bca53174..70c8c84c857bf 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -74,9 +74,14 @@ bool formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, /// changes to CFG, preserved. /// /// Returns true if any modifications are made. -bool formLCSSAForInstructions(SmallVectorImpl &Worklist, - const DominatorTree &DT, const LoopInfo &LI, - ScalarEvolution *SE); +/// +/// This function may introduce unused PHI nodes. If \p PHIsToRemove is not +/// nullptr, those are added to it (before removing, the caller has to check if +/// they still do not have any uses). Otherwise the PHIs are directly removed. +bool formLCSSAForInstructions( + SmallVectorImpl &Worklist, const DominatorTree &DT, + const LoopInfo &LI, ScalarEvolution *SE, IRBuilderBase &Builder, + SmallVectorImpl *PHIsToRemove = nullptr); /// Put loop into LCSSA form. /// diff --git a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h index 1efdcc65b39a8..ac6cee637a46d 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopVersioning.h +++ b/llvm/include/llvm/Transforms/Utils/LoopVersioning.h @@ -16,6 +16,7 @@ #define LLVM_TRANSFORMS_UTILS_LOOPVERSIONING_H #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/IR/PassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -148,6 +149,14 @@ class LoopVersioning { DominatorTree *DT; ScalarEvolution *SE; }; + +/// Expose LoopVersioning as a pass. Currently this is only used for +/// unit-testing. It adds all memchecks necessary to remove all may-aliasing +/// array accesses from the loop. 
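The new LoopPeel.h above groups the peeling entry points that used to live in UnrollLoop.h. A minimal usage sketch, under the assumption that the caller already holds valid LoopInfo, ScalarEvolution, DominatorTree, and AssumptionCache analyses; the peel count of 1 is arbitrary:

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopPeel.h"

bool tryPeelOnce(llvm::Loop *L, llvm::LoopInfo *LI, llvm::ScalarEvolution *SE,
                 llvm::DominatorTree *DT, llvm::AssumptionCache *AC) {
  if (!llvm::canPeel(L))
    return false;
  // Peel a single iteration off the front of the loop.
  return llvm::peelLoop(L, /*PeelCount=*/1, LI, SE, DT, AC,
                        /*PreserveLCSSA=*/true);
}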
+class LoopVersioningPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; } #endif diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 6e53d15d25fc7..cb212b2705eed 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -52,6 +52,9 @@ class SCEVExpander : public SCEVVisitor { // New instructions receive a name to identify them with the current pass. const char *IVName; + /// Indicates whether LCSSA phis should be created for inserted values. + bool PreserveLCSSA; + // InsertedExpressions caches Values for reuse, so must track RAUW. DenseMap, TrackingVH> InsertedExpressions; @@ -146,9 +149,10 @@ class SCEVExpander : public SCEVVisitor { public: /// Construct a SCEVExpander in "canonical" mode. explicit SCEVExpander(ScalarEvolution &se, const DataLayout &DL, - const char *name) - : SE(se), DL(DL), IVName(name), IVIncInsertLoop(nullptr), - IVIncInsertPos(nullptr), CanonicalMode(true), LSRMode(false), + const char *name, bool PreserveLCSSA = true) + : SE(se), DL(DL), IVName(name), PreserveLCSSA(PreserveLCSSA), + IVIncInsertLoop(nullptr), IVIncInsertPos(nullptr), CanonicalMode(true), + LSRMode(false), Builder(se.getContext(), TargetFolder(DL), IRBuilderCallbackInserter( [this](Instruction *I) { rememberInstruction(I); })) { @@ -223,14 +227,18 @@ class SCEVExpander : public SCEVVisitor { const TargetTransformInfo *TTI = nullptr); /// Insert code to directly compute the specified SCEV expression into the - /// program. The inserted code is inserted into the specified block. - Value *expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I); + /// program. The code is inserted into the specified block. + Value *expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I) { + return expandCodeForImpl(SH, Ty, I, true); + } /// Insert code to directly compute the specified SCEV expression into the - /// program. The inserted code is inserted into the SCEVExpander's current + /// program. The code is inserted into the SCEVExpander's current /// insertion point. If a type is specified, the result will be expanded to /// have that type, with a cast if necessary. - Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); + Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr) { + return expandCodeForImpl(SH, Ty, true); + } /// Generates a code sequence that evaluates this predicate. The inserted /// instructions will be at position \p Loc. The result will be of type i1 @@ -338,6 +346,20 @@ class SCEVExpander : public SCEVVisitor { private: LLVMContext &getContext() const { return SE.getContext(); } + /// Insert code to directly compute the specified SCEV expression into the + /// program. The code is inserted into the SCEVExpander's current + /// insertion point. If a type is specified, the result will be expanded to + /// have that type, with a cast if necessary. If \p Root is true, this + /// indicates that \p SH is the top-level expression to expand passed from + /// an external client call. + Value *expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root); + + /// Insert code to directly compute the specified SCEV expression into the + /// program. The code is inserted into the specified block. If \p + /// Root is true, this indicates that \p SH is the top-level expression to + /// expand passed from an external client call. 
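The PreserveLCSSA flag threaded through SCEVExpander above controls whether values expanded inside a loop receive LCSSA phis when used beyond it. A hedged sketch of constructing the expander with the new parameter; SE, DL, and the insertion point are assumed to come from the enclosing pass, and the IV name is arbitrary:

#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

llvm::Value *expandAt(llvm::ScalarEvolution &SE, const llvm::DataLayout &DL,
                      const llvm::SCEV *S, llvm::Type *Ty,
                      llvm::Instruction *InsertPt) {
  // With PreserveLCSSA on, the expander will create the LCSSA phis needed
  // for any out-of-loop users of the expanded value.
  llvm::SCEVExpander Expander(SE, DL, "expanded", /*PreserveLCSSA=*/true);
  return Expander.expandCodeFor(S, Ty, InsertPt);
}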
+ Value *expandCodeForImpl(const SCEV *SH, Type *Ty, Instruction *I, bool Root); + /// Recursive helper function for isHighCostExpansion. bool isHighCostExpansionHelper(const SCEV *S, Loop *L, const Instruction &At, int &BudgetRemaining, @@ -419,6 +441,11 @@ class SCEVExpander : public SCEVVisitor { Instruction *Pos, PHINode *LoopPhi); void fixupInsertPoints(Instruction *I); + + /// If required, create LCSSA PHIs for \p Users' operand \p OpIdx. If new + /// LCSSA PHIs have been created, return the LCSSA PHI available at \p User. + /// If no PHIs have been created, return the unchanged operand \p OpIdx. + Value *fixupLCSSAFormFor(Instruction *User, unsigned OpIdx); }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index 46f6ca0462f8b..fb3a7490346f4 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -25,7 +25,7 @@ struct SimplifyCFGOptions { bool ForwardSwitchCondToPhi = false; bool ConvertSwitchToLookupTable = false; bool NeedCanonicalLoop = true; - bool HoistCommonInsts = true; + bool HoistCommonInsts = false; bool SinkCommonInsts = false; bool SimplifyCondBranch = true; bool FoldTwoEntryPHINode = true; diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index d6ee19365c729..2819a3468766d 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -24,7 +24,6 @@ class CallInst; class DataLayout; class Instruction; class IRBuilderBase; -class TargetLibraryInfo; class Function; class OptimizationRemarkEmitter; class BlockFrequencyInfo; diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index bb3d02b959564..4254bd71a41c0 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -92,16 +92,6 @@ bool UnrollRuntimeLoopRemainder( const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop = nullptr); -void computePeelCount(Loop *L, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP, - TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, ScalarEvolution &SE); - -bool canPeel(Loop *L); - -bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA); - LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, @@ -121,7 +111,6 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, - bool &UseUpperBound); void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, @@ -138,12 +127,6 @@ TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Optional UserAllowPartial, Optional UserRuntime, Optional UserUpperBound, Optional UserFullUnrollMaxCount); -TargetTransformInfo::PeelingPreferences -gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - Optional UserAllowPeeling, - Optional UserAllowProfileBasedPeeling); - unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, 
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index fec2415a0e459..2593fc42afdb8 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -641,8 +641,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I, if (!DT) return ModRefInfo::ModRef; - const Value *Object = - GetUnderlyingObject(MemLoc.Ptr, I->getModule()->getDataLayout()); + const Value *Object = getUnderlyingObject(MemLoc.Ptr); if (!isIdentifiedObject(Object) || isa(Object) || isa(Object)) return ModRefInfo::ModRef; diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 16bfd5c75902f..bc0cdc4c4c77a 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -175,7 +175,7 @@ void AssumptionCache::transferAffectedValuesInCache(Value *OV, Value *NV) { return; for (auto &A : AVI->second) - if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end()) + if (!llvm::is_contained(NAVV, A)) NAVV.push_back(A); AffectedValues.erase(OV); } diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 33f122728d2aa..2ede4baaf6832 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -66,7 +66,7 @@ using namespace llvm; /// Enable analysis of recursive PHI nodes. static cl::opt EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden, - cl::init(false)); + cl::init(true)); /// By default, even on 32-bit architectures we use 64-bit integers for /// calculations. This will allow us to more-aggressively decompose indexing @@ -91,7 +91,7 @@ STATISTIC(SearchTimes, "Number of times a GEP is decomposed"); const unsigned MaxNumPhiBBsValueReachabilityCheck = 20; // The max limit of the search depth in DecomposeGEPExpression() and -// GetUnderlyingObject(), both functions need to use the same search +// getUnderlyingObject(), both functions need to use the same search // depth otherwise the algorithm in aliasGEP will assert. static const unsigned MaxLookupSearchDepth = 6; @@ -456,8 +456,8 @@ static unsigned getMaxPointerSize(const DataLayout &DL) { /// such, the gep cannot necessarily be reconstructed from its decomposed form. /// /// When DataLayout is around, this function is capable of analyzing everything -/// that GetUnderlyingObject can look through. To be able to do that -/// GetUnderlyingObject and DecomposeGEPExpression must use the same search +/// that getUnderlyingObject can look through. To be able to do that +/// getUnderlyingObject and DecomposeGEPExpression must use the same search /// depth (MaxLookupSearchDepth). When DataLayout not is around, it just looks /// through pointer casts. 
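The AssumptionCache change above is a pure idiom swap: llvm::is_contained wraps exactly the std::find(...) != end() pattern it replaces. A standalone equivalent for illustration:

#include <algorithm>
#include <iostream>
#include <vector>

// What llvm::is_contained does, modulo LLVM's range machinery.
template <typename R, typename E>
bool is_contained(const R &Range, const E &El) {
  return std::find(std::begin(Range), std::end(Range), El) != std::end(Range);
}

int main() {
  std::vector<int> V{1, 2, 3};
  std::cout << is_contained(V, 2) << ' ' << is_contained(V, 9) << '\n'; // 1 0
}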
bool BasicAAResult::DecomposeGEPExpression(const Value *V, @@ -661,7 +661,7 @@ bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc, SmallVector Worklist; Worklist.push_back(Loc.Ptr); do { - const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL); + const Value *V = getUnderlyingObject(Worklist.pop_back_val()); if (!Visited.insert(V).second) { Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal); @@ -875,7 +875,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, assert(notDifferentParent(Call, Loc.Ptr) && "AliasAnalysis query involving multiple functions!"); - const Value *Object = GetUnderlyingObject(Loc.Ptr, DL); + const Value *Object = getUnderlyingObject(Loc.Ptr); // Calls marked 'tail' cannot read or write allocas from the current frame // because the current frame might be destroyed by the time they run. However, @@ -1309,7 +1309,7 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp, /// another pointer. /// /// We know that V1 is a GEP, but we don't know anything about V2. -/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for +/// UnderlyingV1 is getUnderlyingObject(GEP1), UnderlyingV2 is the same for /// V2. AliasResult BasicAAResult::aliasGEP( const GEPOperator *GEP1, LocationSize V1Size, const AAMDNodes &V1AAInfo, @@ -1338,7 +1338,7 @@ AliasResult BasicAAResult::aliasGEP( assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 && "DecomposeGEPExpression returned a result different from " - "GetUnderlyingObject"); + "getUnderlyingObject"); // If the GEP's offset relative to its base is such that the base would // fall below the start of the object underlying V2, then the GEP and V2 @@ -1782,10 +1782,10 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, // Figure out what objects these things are pointing to if we can. if (O1 == nullptr) - O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth); + O1 = getUnderlyingObject(V1, MaxLookupSearchDepth); if (O2 == nullptr) - O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth); + O2 = getUnderlyingObject(V2, MaxLookupSearchDepth); // Null values in the default address space don't point to any object, so they // don't alias any other pointer. diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index a396b5ad21c6a..7b24fe9d56c25 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -148,6 +148,105 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1; /// instruction. This is essentially never taken. static const uint32_t IH_NONTAKEN_WEIGHT = 1; +BranchProbabilityInfo::SccInfo::SccInfo(const Function &F) { + // Record SCC numbers of blocks in the CFG to identify irreducible loops. + // FIXME: We could only calculate this if the CFG is known to be irreducible + // (perhaps cache this info in LoopInfo if we can easily calculate it there?). + int SccNum = 0; + for (scc_iterator It = scc_begin(&F); !It.isAtEnd(); + ++It, ++SccNum) { + // Ignore single-block SCCs since they either aren't loops or LoopInfo will + // catch them. 
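For context on the renamed getUnderlyingObject used throughout these hunks: it strips address computations (GEPs) and casts until an identified object is reached, bounded by the same MaxLookupSearchDepth that DecomposeGEPExpression must use. A toy model with stand-in node types:

#include <iostream>

// Base == nullptr means "this is the underlying object itself".
struct Value { const char *Name; Value *Base; };

const Value *getUnderlyingObjectModel(const Value *V, unsigned MaxDepth = 6) {
  // Step through at most MaxDepth GEP/bitcast-like indirections.
  for (unsigned I = 0; I != MaxDepth && V->Base; ++I)
    V = V->Base;
  return V;
}

int main() {
  Value Alloca{"alloca", nullptr};
  Value GEP1{"gep1", &Alloca};
  Value GEP2{"gep2", &GEP1};
  std::cout << getUnderlyingObjectModel(&GEP2)->Name << '\n'; // alloca
}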
+ const std::vector &Scc = *It; + if (Scc.size() == 1) + continue; + + LLVM_DEBUG(dbgs() << "BPI: SCC " << SccNum << ":"); + for (const auto *BB : Scc) { + LLVM_DEBUG(dbgs() << " " << BB->getName()); + SccNums[BB] = SccNum; + calculateSccBlockType(BB, SccNum); + } + LLVM_DEBUG(dbgs() << "\n"); + } +} + +int BranchProbabilityInfo::SccInfo::getSCCNum(const BasicBlock *BB) const { + auto SccIt = SccNums.find(BB); + if (SccIt == SccNums.end()) + return -1; + return SccIt->second; +} + +void BranchProbabilityInfo::SccInfo::getSccEnterBlocks( + int SccNum, SmallVectorImpl &Enters) const { + + for (auto MapIt : SccBlocks[SccNum]) { + const auto *BB = MapIt.first; + if (isSCCHeader(BB, SccNum)) + for (const auto *Pred : predecessors(BB)) + if (getSCCNum(Pred) != SccNum) + Enters.push_back(const_cast(BB)); + } +} + +void BranchProbabilityInfo::SccInfo::getSccExitBlocks( + int SccNum, SmallVectorImpl &Exits) const { + for (auto MapIt : SccBlocks[SccNum]) { + const auto *BB = MapIt.first; + if (isSCCExitingBlock(BB, SccNum)) + for (const auto *Succ : successors(BB)) + if (getSCCNum(Succ) != SccNum) + Exits.push_back(const_cast(BB)); + } +} + +uint32_t BranchProbabilityInfo::SccInfo::getSccBlockType(const BasicBlock *BB, + int SccNum) const { + assert(getSCCNum(BB) == SccNum); + + assert(SccBlocks.size() > static_cast(SccNum) && "Unknown SCC"); + const auto &SccBlockTypes = SccBlocks[SccNum]; + + auto It = SccBlockTypes.find(BB); + if (It != SccBlockTypes.end()) { + return It->second; + } + return Inner; +} + +void BranchProbabilityInfo::SccInfo::calculateSccBlockType(const BasicBlock *BB, + int SccNum) { + assert(getSCCNum(BB) == SccNum); + uint32_t BlockType = Inner; + + if (llvm::any_of(make_range(pred_begin(BB), pred_end(BB)), + [&](const BasicBlock *Pred) { + // Consider any block that is an entry point to the SCC as + // a header. + return getSCCNum(Pred) != SccNum; + })) + BlockType |= Header; + + if (llvm::any_of( + make_range(succ_begin(BB), succ_end(BB)), + [&](const BasicBlock *Succ) { return getSCCNum(Succ) != SccNum; })) + BlockType |= Exiting; + + // Lazily compute the set of headers for a given SCC and cache the results + // in the SccHeaderMap. + if (SccBlocks.size() <= static_cast(SccNum)) + SccBlocks.resize(SccNum + 1); + auto &SccBlockTypes = SccBlocks[SccNum]; + + if (BlockType != Inner) { + bool IsInserted; + std::tie(std::ignore, IsInserted) = + SccBlockTypes.insert(std::make_pair(BB, BlockType)); + assert(IsInserted && "Duplicated block in SCC"); + } +} + static void UpdatePDTWorklist(const BasicBlock *BB, PostDominatorTree *PDT, SmallVectorImpl &WorkList, SmallPtrSetImpl &TargetSet) { @@ -511,38 +610,6 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) { return true; } -static int getSCCNum(const BasicBlock *BB, - const BranchProbabilityInfo::SccInfo &SccI) { - auto SccIt = SccI.SccNums.find(BB); - if (SccIt == SccI.SccNums.end()) - return -1; - return SccIt->second; -} - -// Consider any block that is an entry point to the SCC as a header. -static bool isSCCHeader(const BasicBlock *BB, int SccNum, - BranchProbabilityInfo::SccInfo &SccI) { - assert(getSCCNum(BB, SccI) == SccNum); - - // Lazily compute the set of headers for a given SCC and cache the results - // in the SccHeaderMap. 
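The calculateSccBlockType logic above classifies each block with a small bitmask: Header if some predecessor lies outside the block's SCC, Exiting if some successor does, Inner otherwise. A toy version of the classification with illustrative types:

#include <iostream>
#include <vector>

enum BlockType : unsigned { Inner = 0, Header = 1, Exiting = 2 };

struct Block {
  int SccNum;
  std::vector<const Block *> Preds, Succs;
};

unsigned classify(const Block &BB) {
  unsigned T = Inner;
  for (const Block *P : BB.Preds)
    if (P->SccNum != BB.SccNum)
      T |= Header; // Entered from outside the SCC.
  for (const Block *S : BB.Succs)
    if (S->SccNum != BB.SccNum)
      T |= Exiting; // Leaves the SCC.
  return T;
}

int main() {
  Block Entry{0, {}, {}}, H{1, {}, {}}, Latch{1, {}, {}}, Exit{2, {}, {}};
  H.Preds = {&Entry, &Latch}; H.Succs = {&Latch};
  Latch.Preds = {&H};         Latch.Succs = {&H, &Exit};
  std::cout << classify(H) << ' ' << classify(Latch) << '\n'; // 1 2
}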
- if (SccI.SccHeaders.size() <= static_cast(SccNum)) - SccI.SccHeaders.resize(SccNum + 1); - auto &HeaderMap = SccI.SccHeaders[SccNum]; - bool Inserted; - BranchProbabilityInfo::SccHeaderMap::iterator HeaderMapIt; - std::tie(HeaderMapIt, Inserted) = HeaderMap.insert(std::make_pair(BB, false)); - if (Inserted) { - bool IsHeader = llvm::any_of(make_range(pred_begin(BB), pred_end(BB)), - [&](const BasicBlock *Pred) { - return getSCCNum(Pred, SccI) != SccNum; - }); - HeaderMapIt->second = IsHeader; - return IsHeader; - } else - return HeaderMapIt->second; -} - // Compute the unlikely successors to the block BB in the loop L, specifically // those that are unlikely because this is a loop, and add them to the // UnlikelyBlocks set. @@ -625,8 +692,7 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L, // we can constant-evaluate the compare to see if it makes the branch be // taken or not. Constant *CmpLHSConst = dyn_cast(V); - if (!CmpLHSConst || - std::find(succ_begin(BB), succ_end(BB), B) == succ_end(BB)) + if (!CmpLHSConst || !llvm::is_contained(successors(BB), B)) continue; // First collapse InstChain for (Instruction *I : llvm::reverse(InstChain)) { @@ -653,12 +719,11 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L, // Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges // as taken, exiting edges as not-taken. bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB, - const LoopInfo &LI, - SccInfo &SccI) { + const LoopInfo &LI) { int SccNum; Loop *L = LI.getLoopFor(BB); if (!L) { - SccNum = getSCCNum(BB, SccI); + SccNum = SccI->getSCCNum(BB); if (SccNum < 0) return false; } @@ -685,9 +750,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB, else InEdges.push_back(I.getSuccessorIndex()); } else { - if (getSCCNum(*I, SccI) != SccNum) + if (SccI->getSCCNum(*I) != SccNum) ExitingEdges.push_back(I.getSuccessorIndex()); - else if (isSCCHeader(*I, SccNum, SccI)) + else if (SccI->isSCCHeader(*I, SccNum)) BackEdges.push_back(I.getSuccessorIndex()); else InEdges.push_back(I.getSuccessorIndex()); @@ -1072,26 +1137,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, assert(PostDominatedByUnreachable.empty()); assert(PostDominatedByColdCall.empty()); - // Record SCC numbers of blocks in the CFG to identify irreducible loops. - // FIXME: We could only calculate this if the CFG is known to be irreducible - // (perhaps cache this info in LoopInfo if we can easily calculate it there?). - int SccNum = 0; - SccInfo SccI; - for (scc_iterator It = scc_begin(&F); !It.isAtEnd(); - ++It, ++SccNum) { - // Ignore single-block SCCs since they either aren't loops or LoopInfo will - // catch them. 
- const std::vector &Scc = *It; - if (Scc.size() == 1) - continue; - - LLVM_DEBUG(dbgs() << "BPI: SCC " << SccNum << ":"); - for (auto *BB : Scc) { - LLVM_DEBUG(dbgs() << " " << BB->getName()); - SccI.SccNums[BB] = SccNum; - } - LLVM_DEBUG(dbgs() << "\n"); - } + SccI = std::make_unique(F); std::unique_ptr PDTPtr; @@ -1119,7 +1165,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, continue; if (calcColdCallHeuristics(BB)) continue; - if (calcLoopBranchHeuristics(BB, LI, SccI)) + if (calcLoopBranchHeuristics(BB, LI)) continue; if (calcPointerHeuristics(BB)) continue; @@ -1131,6 +1177,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, PostDominatedByUnreachable.clear(); PostDominatedByColdCall.clear(); + SccI.reset(); if (PrintBranchProb && (PrintBranchProbFuncName.empty() || diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp index fd3166f8cd0cd..a862de643c081 100644 --- a/llvm/lib/Analysis/CGSCCPassManager.cpp +++ b/llvm/lib/Analysis/CGSCCPassManager.cpp @@ -78,9 +78,6 @@ PassManagername() << " on " << *C << "\n"; - PreservedAnalyses PassPA; { TimeTraceScope TimeScope(Pass->name()); diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index 55adb454b7338..19c128b6633c0 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -281,13 +281,37 @@ void CallGraphNode::replaceCallEdge(CallBase &Call, CallBase &NewCall, I->second = NewNode; NewNode->AddRef(); - // Refresh callback references. - forEachCallbackFunction(Call, [=](Function *CB) { - removeOneAbstractEdgeTo(CG->getOrInsertFunction(CB)); + // Refresh callback references. Do not resize CalledFunctions if the + // number of callbacks is the same for new and old call sites. + SmallVector OldCBs; + SmallVector NewCBs; + forEachCallbackFunction(Call, [this, &OldCBs](Function *CB) { + OldCBs.push_back(CG->getOrInsertFunction(CB)); }); - forEachCallbackFunction(NewCall, [=](Function *CB) { - addCalledFunction(nullptr, CG->getOrInsertFunction(CB)); + forEachCallbackFunction(NewCall, [this, &NewCBs](Function *CB) { + NewCBs.push_back(CG->getOrInsertFunction(CB)); }); + if (OldCBs.size() == NewCBs.size()) { + for (unsigned N = 0; N < OldCBs.size(); ++N) { + CallGraphNode *OldNode = OldCBs[N]; + CallGraphNode *NewNode = NewCBs[N]; + for (auto J = CalledFunctions.begin();; ++J) { + assert(J != CalledFunctions.end() && + "Cannot find callsite to update!"); + if (!J->first && J->second == OldNode) { + J->second = NewNode; + OldNode->DropRef(); + NewNode->AddRef(); + break; + } + } + } + } else { + for (auto *CGN : OldCBs) + removeOneAbstractEdgeTo(CGN); + for (auto *CGN : NewCBs) + addCalledFunction(nullptr, CGN); + } return; } } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 8b101e3b2cc47..6de6aaa48d4f8 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -273,7 +273,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions // marked with nocapture do not capture. This means that places like - // GetUnderlyingObject in ValueTracking or DecomposeGEPExpression + // getUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. 
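The replaceCallEdge change above patches callback edges in place when the old and new call sites carry the same number of callbacks, instead of erasing and re-adding them (which would resize CalledFunctions). A standalone model of the in-place swap, including the reference-count bookkeeping; the types are toys:

#include <cassert>
#include <iostream>
#include <utility>
#include <vector>

struct Node { const char *Name; int Refs = 0; };

// (CallSite, Callee) pairs; a null CallSite marks a callback edge.
using Edge = std::pair<const void *, Node *>;

void swapCallbackEdge(std::vector<Edge> &Edges, Node *OldCB, Node *NewCB) {
  for (Edge &E : Edges) {
    if (!E.first && E.second == OldCB) {
      E.second = NewCB; // Patch the record; no erase/insert churn.
      --OldCB->Refs;
      ++NewCB->Refs;
      return;
    }
  }
  assert(false && "Cannot find callsite to update!");
}

int main() {
  Node A{"A", 1}, B{"B", 0};
  std::vector<Edge> Edges{{nullptr, &A}};
  swapCallbackEdge(Edges, &A, &B);
  std::cout << Edges[0].second->Name << ' ' << A.Refs << ' ' << B.Refs
            << '\n'; // B 0 1
}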
if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) { diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 794edeb714fd2..28a30090a0093 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -718,7 +718,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty, // If this load comes from anywhere in a constant global, and if the global // is all undef or zero, we know what it loads. - if (auto *GV = dyn_cast(GetUnderlyingObject(CE, DL))) { + if (auto *GV = dyn_cast(getUnderlyingObject(CE))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { if (GV->getInitializer()->isNullValue()) return Constant::getNullValue(Ty); @@ -1071,6 +1071,8 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode, default: return nullptr; case Instruction::ICmp: case Instruction::FCmp: llvm_unreachable("Invalid for compares"); + case Instruction::Freeze: + return isGuaranteedNotToBeUndefOrPoison(Ops[0]) ? Ops[0] : nullptr; case Instruction::Call: if (auto *F = dyn_cast(Ops.back())) { const auto *Call = cast(InstOrCE); @@ -1434,6 +1436,11 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::launder_invariant_group: case Intrinsic::strip_invariant_group: case Intrinsic::masked_load: + case Intrinsic::abs: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: @@ -2384,8 +2391,37 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, !getConstIntOrUndef(Operands[1], C1)) return nullptr; + unsigned BitWidth = Ty->getScalarSizeInBits(); switch (IntrinsicID) { default: break; + case Intrinsic::smax: + if (!C0 && !C1) + return UndefValue::get(Ty); + if (!C0 || !C1) + return ConstantInt::get(Ty, APInt::getSignedMaxValue(BitWidth)); + return ConstantInt::get(Ty, C0->sgt(*C1) ? *C0 : *C1); + + case Intrinsic::smin: + if (!C0 && !C1) + return UndefValue::get(Ty); + if (!C0 || !C1) + return ConstantInt::get(Ty, APInt::getSignedMinValue(BitWidth)); + return ConstantInt::get(Ty, C0->slt(*C1) ? *C0 : *C1); + + case Intrinsic::umax: + if (!C0 && !C1) + return UndefValue::get(Ty); + if (!C0 || !C1) + return ConstantInt::get(Ty, APInt::getMaxValue(BitWidth)); + return ConstantInt::get(Ty, C0->ugt(*C1) ? *C0 : *C1); + + case Intrinsic::umin: + if (!C0 && !C1) + return UndefValue::get(Ty); + if (!C0 || !C1) + return ConstantInt::get(Ty, APInt::getMinValue(BitWidth)); + return ConstantInt::get(Ty, C0->ult(*C1) ? 
*C0 : *C1); + case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::uadd_with_overflow: @@ -2470,6 +2506,18 @@ static Constant *ConstantFoldScalarCall2(StringRef Name, return ConstantInt::get(Ty, C0->countTrailingZeros()); else return ConstantInt::get(Ty, C0->countLeadingZeros()); + + case Intrinsic::abs: + // Undef or minimum val operand with poison min --> undef + assert(C1 && "Must be constant int"); + if (C1->isOneValue() && (!C0 || C0->isMinSignedValue())) + return UndefValue::get(Ty); + + // Undef operand with no poison min --> 0 (sign bit must be clear) + if (C1->isNullValue() && !C0) + return Constant::getNullValue(Ty); + + return ConstantInt::get(Ty, C0->abs()); } return nullptr; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index bcfeef7fb8abc..9b108e703e513 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -659,8 +659,8 @@ static AliasResult underlyingObjectsAlias(AAResults *AA, return NoAlias; // Check the underlying objects are the same - const Value *AObj = GetUnderlyingObject(LocA.Ptr, DL); - const Value *BObj = GetUnderlyingObject(LocB.Ptr, DL); + const Value *AObj = getUnderlyingObject(LocA.Ptr); + const Value *BObj = getUnderlyingObject(LocB.Ptr); // If the underlying objects are the same, they must alias if (AObj == BObj) diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index ba9343543dbbc..32fd16810dc35 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -298,35 +298,12 @@ class ModelUnderTrainingRunner final : public MLModelRunner { private: std::unique_ptr Evaluator; - // The training framework needs some additional features, that just need to - // be set to 0. - struct TensorSpec { - std::string Name; - std::function &Dim)> - Initializer; - }; - + // The training framework needs some additional features. 
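The TrainingOnlyFeatures list that follows replaces the per-feature initializer lambdas with declarative TensorSpec entries. Note that the template arguments did not survive into this rendering of the diff; the real calls have the form TensorSpec::createSpec<T>(Name, Shape). A hedged sketch of building such specs — the element types are assumptions, makeTrainingOnlySpecs is illustrative, and the API is only available when LLVM is configured with the TF C API:

#include "llvm/Analysis/Utils/TFUtils.h"
#include <string>
#include <vector>

using llvm::TensorSpec;

// Each training-only feature becomes a declaratively described tensor of
// shape {1}; the element types below are assumed, not taken from the patch.
static std::vector<TensorSpec> makeTrainingOnlySpecs(const std::string &Prefix) {
  return {TensorSpec::createSpec<int64_t>(Prefix + "inlining_default", {1}),
          TensorSpec::createSpec<float>(Prefix + "discount", {1}),
          TensorSpec::createSpec<float>(Prefix + "reward", {1}),
          TensorSpec::createSpec<int32_t>(Prefix + "step_type", {1})};
}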
const std::vector TrainingOnlyFeatures{ - {"inlining_default", - [](TFModelEvaluator *Evaluator, size_t Index, - const std::vector &Dim) { - Evaluator->initInput(Index, Dim); - }}, - {"discount", - [](TFModelEvaluator *Evaluator, size_t Index, - const std::vector &Dim) { - Evaluator->initInput(Index, Dim); - }}, - {"reward", - [](TFModelEvaluator *Evaluator, size_t Index, - const std::vector &Dim) { - Evaluator->initInput(Index, Dim); - }}, - {"step_type", [](TFModelEvaluator *Evaluator, size_t Index, - const std::vector &Dim) { - Evaluator->initInput(Index, Dim); - }}}; + TensorSpec::createSpec(TFFeedPrefix + "inlining_default", {1}), + TensorSpec::createSpec(TFFeedPrefix + "discount", {1}), + TensorSpec::createSpec(TFFeedPrefix + "reward", {1}), + TensorSpec::createSpec(TFFeedPrefix + "step_type", {1})}; }; } // namespace @@ -409,33 +386,22 @@ size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { ModelUnderTrainingRunner::ModelUnderTrainingRunner(LLVMContext &Ctx, const std::string &ModelPath) : MLModelRunner(Ctx) { - std::vector InputNames; - std::vector OutputNames; + std::vector InputSpecs; + std::vector OutputSpecs; for (size_t I = 0; I < NumberOfFeatures; ++I) - InputNames.push_back(TFFeedPrefix + FeatureNameMap[I]); - for (size_t I = 0; I < TrainingOnlyFeatures.size(); ++I) - InputNames.push_back(TFFeedPrefix + TrainingOnlyFeatures[I].Name); - OutputNames.push_back(TFDecisionName); + InputSpecs.push_back( + TensorSpec::createSpec(TFFeedPrefix + FeatureNameMap[I], {1})); + InputSpecs.insert(InputSpecs.end(), TrainingOnlyFeatures.begin(), + TrainingOnlyFeatures.end()); + OutputSpecs.push_back(TensorSpec::createSpec(TFDecisionName, {1})); Evaluator = - std::make_unique(ModelPath, InputNames, OutputNames); + std::make_unique(ModelPath, InputSpecs, OutputSpecs); if (!Evaluator || !Evaluator->isValid()) { Ctx.emitError("Failed to create inliner saved model evaluator"); Evaluator.reset(); return; } - - static const std::vector Dim{1}; - - size_t InputIndex = 0; - for (; InputIndex < NumberOfFeatures; ++InputIndex) { - Evaluator->initInput(InputIndex, Dim); - } - - for (; InputIndex < InputNames.size(); ++InputIndex) { - TrainingOnlyFeatures[InputIndex - NumberOfFeatures].Initializer( - Evaluator.get(), InputIndex, Dim); - } } bool ModelUnderTrainingRunner::run() { diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp index a25471ae9600b..bf9e475b0d753 100644 --- a/llvm/lib/Analysis/GlobalsModRef.cpp +++ b/llvm/lib/Analysis/GlobalsModRef.cpp @@ -44,7 +44,7 @@ STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects"); // An option to enable unsafe alias results from the GlobalsModRef analysis. // When enabled, GlobalsModRef will provide no-alias results which in extremely // rare cases may not be conservatively correct. In particular, in the face of -// transforms which cause assymetry between how effective GetUnderlyingObject +// transforms which cause assymetry between how effective getUnderlyingObject // is for two pointers, it may produce incorrect results. // // These unsafe results have been returned by GMR for many years without @@ -435,8 +435,7 @@ bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) { continue; // Check the value being stored. - Value *Ptr = GetUnderlyingObject(SI->getOperand(0), - GV->getParent()->getDataLayout()); + Value *Ptr = getUnderlyingObject(SI->getOperand(0)); if (!isAllocLikeFn(Ptr, &GetTLI(*SI->getFunction()))) return false; // Too hard to analyze. 
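The change above is one instance of the theme running through this patch: getUnderlyingObject dropped its unused DataLayout parameter (and was renamed from GetUnderlyingObject), so call sites now simply pass the pointer. A minimal usage sketch; stripToBase is a hypothetical helper:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Strips GEPs, casts and other no-ops down to the base object. MaxLookup
// bounds the walk; 0 means no limit, and the default is 6.
static const Value *stripToBase(const Value *Ptr) {
  return getUnderlyingObject(Ptr, /*MaxLookup=*/6);
}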
@@ -672,12 +671,12 @@ static bool isNonEscapingGlobalNoAliasWithLoad(const GlobalValue *GV, return false; if (auto *LI = dyn_cast(Input)) { - Inputs.push_back(GetUnderlyingObject(LI->getPointerOperand(), DL)); + Inputs.push_back(getUnderlyingObject(LI->getPointerOperand())); continue; } if (auto *SI = dyn_cast(Input)) { - const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL); - const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL); + const Value *LHS = getUnderlyingObject(SI->getTrueValue()); + const Value *RHS = getUnderlyingObject(SI->getFalseValue()); if (Visited.insert(LHS).second) Inputs.push_back(LHS); if (Visited.insert(RHS).second) @@ -686,7 +685,7 @@ static bool isNonEscapingGlobalNoAliasWithLoad(const GlobalValue *GV, } if (auto *PN = dyn_cast(Input)) { for (const Value *Op : PN->incoming_values()) { - Op = GetUnderlyingObject(Op, DL); + Op = getUnderlyingObject(Op); if (Visited.insert(Op).second) Inputs.push_back(Op); } @@ -785,7 +784,7 @@ bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV, if (auto *LI = dyn_cast(Input)) { // A pointer loaded from a global would have been captured, and we know // that the global is non-escaping, so no alias. - const Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL); + const Value *Ptr = getUnderlyingObject(LI->getPointerOperand()); if (isNonEscapingGlobalNoAliasWithLoad(GV, Ptr, Depth, DL)) // The load does not alias with GV. continue; @@ -793,8 +792,8 @@ bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV, return false; } if (auto *SI = dyn_cast(Input)) { - const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL); - const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL); + const Value *LHS = getUnderlyingObject(SI->getTrueValue()); + const Value *RHS = getUnderlyingObject(SI->getFalseValue()); if (Visited.insert(LHS).second) Inputs.push_back(LHS); if (Visited.insert(RHS).second) @@ -803,7 +802,7 @@ bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV, } if (auto *PN = dyn_cast(Input)) { for (const Value *Op : PN->incoming_values()) { - Op = GetUnderlyingObject(Op, DL); + Op = getUnderlyingObject(Op); if (Visited.insert(Op).second) Inputs.push_back(Op); } @@ -838,8 +837,8 @@ AliasResult GlobalsAAResult::alias(const MemoryLocation &LocA, const MemoryLocation &LocB, AAQueryInfo &AAQI) { // Get the base object these pointers point to. - const Value *UV1 = GetUnderlyingObject(LocA.Ptr, DL); - const Value *UV2 = GetUnderlyingObject(LocB.Ptr, DL); + const Value *UV1 = getUnderlyingObject(LocA.Ptr); + const Value *UV2 = getUnderlyingObject(LocB.Ptr); // If either of the underlying values is a global, they may be non-addr-taken // globals, which we can answer queries about. @@ -926,7 +925,7 @@ ModRefInfo GlobalsAAResult::getModRefInfoForArgument(const CallBase *Call, // is based on GV, return the conservative result. for (auto &A : Call->args()) { SmallVector Objects; - GetUnderlyingObjects(A, Objects, DL); + getUnderlyingObjects(A, Objects); // All objects must be identified. if (!all_of(Objects, isIdentifiedObject) && @@ -953,7 +952,7 @@ ModRefInfo GlobalsAAResult::getModRefInfo(const CallBase *Call, // If we are asking for mod/ref info of a direct call with a pointer to a // global we are tracking, return information if we have it. 
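Further down this file, getModRefInfoForArgument uses the plural getUnderlyingObjects, which walks phis and selects and so can produce several candidate bases; it likewise lost its DataLayout parameter. A hedged sketch of the collect-and-screen pattern that resumes in the hunk below; allBasesIdentified is a hypothetical helper:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"

using namespace llvm;

// Collects every object Ptr may be based on (walking phis and selects),
// then checks that each is an identified object such as an alloca or a
// global.
static bool allBasesIdentified(const Value *Ptr) {
  SmallVector<const Value *, 4> Objects;
  getUnderlyingObjects(Ptr, Objects);
  return all_of(Objects, [](const Value *V) { return isIdentifiedObject(V); });
}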
if (const GlobalValue *GV = - dyn_cast(GetUnderlyingObject(Loc.Ptr, DL))) + dyn_cast(getUnderlyingObject(Loc.Ptr))) // If GV is internal to this IR and there is no function with local linkage // that has had their address taken, keep looking for a tighter ModRefInfo. if (GV->hasLocalLinkage() && !UnknownFunctionsWithLocalLinkage) diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp index cffdbe4116086..dc426aaccb22a 100644 --- a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp +++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp @@ -244,19 +244,18 @@ InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() { if (!isEvaluatorRequested()) { return; } - std::vector InputNames{"serving_default_input_1"}; - std::vector OutputName{"StatefulPartitionedCall"}; + std::vector InputSpecs{TensorSpec::createSpec( + "serving_default_input_1", + {1, static_cast( + IRToNativeSizeLearning::FunctionFeatures::FeatureCount)})}; + std::vector OutputSpecs{ + TensorSpec::createSpec("StatefulPartitionedCall", {1})}; Evaluator = std::make_unique( - TFIR2NativeModelPath.getValue().c_str(), InputNames, OutputName); + TFIR2NativeModelPath.getValue().c_str(), InputSpecs, OutputSpecs); if (!Evaluator || !Evaluator->isValid()) { Evaluator.reset(); return; } - static const std::vector Dim{ - 1, static_cast( - IRToNativeSizeLearning::FunctionFeatures::FeatureCount)}; - - Evaluator->initInput(0, Dim); } InlineSizeEstimatorAnalysis::Result diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 49346860f251e..6b8f8e3acc179 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2524,8 +2524,8 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, // memory within the lifetime of the current function (allocas, byval // arguments, globals), then determine the comparison result here. SmallVector LHSUObjs, RHSUObjs; - GetUnderlyingObjects(LHS, LHSUObjs, DL); - GetUnderlyingObjects(RHS, RHSUObjs, DL); + getUnderlyingObjects(LHS, LHSUObjs); + getUnderlyingObjects(RHS, RHSUObjs); // Is the set of underlying objects all noalias calls? auto IsNAC = [](ArrayRef Objects) { @@ -2732,7 +2732,7 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, } const APInt *C; - if (!match(RHS, m_APInt(C))) + if (!match(RHS, m_APIntAllowUndef(C))) return nullptr; // Rule out tautological comparisons (eg., ult 0 or uge 0). @@ -2753,14 +2753,87 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, return nullptr; } +static Value *simplifyICmpWithBinOpOnLHS( + CmpInst::Predicate Pred, BinaryOperator *LBO, Value *RHS, + const SimplifyQuery &Q, unsigned MaxRecurse) { + Type *ITy = GetCompareTy(RHS); // The return type. + + Value *Y = nullptr; + // icmp pred (or X, Y), X + if (match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) { + if (Pred == ICmpInst::ICMP_ULT) + return getFalse(ITy); + if (Pred == ICmpInst::ICMP_UGE) + return getTrue(ITy); + + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { + KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (RHSKnown.isNonNegative() && YKnown.isNegative()) + return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); + if (RHSKnown.isNegative() || YKnown.isNonNegative()) + return Pred == ICmpInst::ICMP_SLT ? 
getFalse(ITy) : getTrue(ITy); + } + } + + // icmp pred (and X, Y), X + if (match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { + if (Pred == ICmpInst::ICMP_UGT) + return getFalse(ITy); + if (Pred == ICmpInst::ICMP_ULE) + return getTrue(ITy); + } + + // icmp pred (urem X, Y), Y + if (match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { + switch (Pred) { + default: + break; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) + break; + LLVM_FALLTHROUGH; + } + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + return getFalse(ITy); + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) + break; + LLVM_FALLTHROUGH; + } + case ICmpInst::ICMP_NE: + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + return getTrue(ITy); + } + } + + // x >> y <=u x + // x udiv y <=u x. + if (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || + match(LBO, m_UDiv(m_Specific(RHS), m_Value()))) { + // icmp pred (X op Y), X + if (Pred == ICmpInst::ICMP_UGT) + return getFalse(ITy); + if (Pred == ICmpInst::ICMP_ULE) + return getTrue(ITy); + } + + return nullptr; +} + /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { - Type *ITy = GetCompareTy(LHS); // The return type. - BinaryOperator *LBO = dyn_cast(LHS); BinaryOperator *RBO = dyn_cast(RHS); if (MaxRecurse && (LBO || RBO)) { @@ -2831,56 +2904,14 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, } } - { - Value *Y = nullptr; - // icmp pred (or X, Y), X - if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) { - if (Pred == ICmpInst::ICMP_ULT) - return getFalse(ITy); - if (Pred == ICmpInst::ICMP_UGE) - return getTrue(ITy); - - if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { - KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (RHSKnown.isNonNegative() && YKnown.isNegative()) - return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); - if (RHSKnown.isNegative() || YKnown.isNonNegative()) - return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy); - } - } - // icmp pred X, (or X, Y) - if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) { - if (Pred == ICmpInst::ICMP_ULE) - return getTrue(ITy); - if (Pred == ICmpInst::ICMP_UGT) - return getFalse(ITy); - - if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { - KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (LHSKnown.isNonNegative() && YKnown.isNegative()) - return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); - if (LHSKnown.isNegative() || YKnown.isNonNegative()) - return Pred == ICmpInst::ICMP_SGT ? 
getFalse(ITy) : getTrue(ITy); - } - } - } + if (LBO) + if (Value *V = simplifyICmpWithBinOpOnLHS(Pred, LBO, RHS, Q, MaxRecurse)) + return V; - // icmp pred (and X, Y), X - if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { - if (Pred == ICmpInst::ICMP_UGT) - return getFalse(ITy); - if (Pred == ICmpInst::ICMP_ULE) - return getTrue(ITy); - } - // icmp pred X, (and X, Y) - if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) { - if (Pred == ICmpInst::ICMP_UGE) - return getTrue(ITy); - if (Pred == ICmpInst::ICMP_ULT) - return getFalse(ITy); - } + if (RBO) + if (Value *V = simplifyICmpWithBinOpOnLHS( + ICmpInst::getSwappedPredicate(Pred), RBO, LHS, Q, MaxRecurse)) + return V; // 0 - (zext X) pred C if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) { @@ -2904,88 +2935,6 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, } } - // icmp pred (urem X, Y), Y - if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { - switch (Pred) { - default: - break; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: { - KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (!Known.isNonNegative()) - break; - LLVM_FALLTHROUGH; - } - case ICmpInst::ICMP_EQ: - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: - return getFalse(ITy); - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: { - KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (!Known.isNonNegative()) - break; - LLVM_FALLTHROUGH; - } - case ICmpInst::ICMP_NE: - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: - return getTrue(ITy); - } - } - - // icmp pred X, (urem Y, X) - if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { - switch (Pred) { - default: - break; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: { - KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (!Known.isNonNegative()) - break; - LLVM_FALLTHROUGH; - } - case ICmpInst::ICMP_NE: - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: - return getTrue(ITy); - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: { - KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (!Known.isNonNegative()) - break; - LLVM_FALLTHROUGH; - } - case ICmpInst::ICMP_EQ: - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: - return getFalse(ITy); - } - } - - // x >> y <=u x - // x udiv y <=u x. - if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || - match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) { - // icmp pred (X op Y), X - if (Pred == ICmpInst::ICMP_UGT) - return getFalse(ITy); - if (Pred == ICmpInst::ICMP_ULE) - return getTrue(ITy); - } - - // x >=u x >> y - // x >=u x udiv y. 
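The refactor in this hunk deserves a note: all of the removed icmp pred X, (binop ... X) cases are now served by one helper that only understands the binop-on-LHS form, with the RHS form recovered via ICmpInst::getSwappedPredicate, relying on the identity that icmp pred A, B equals icmp swapped(pred) B, A. A standalone check of that identity over a small domain; Pred, swapped and cmp model the LLVM enum rather than use it:

#include <cassert>
#include <cstdint>

enum Pred { ULT, UGT, ULE, UGE };

static Pred swapped(Pred P) {
  switch (P) {
  case ULT: return UGT;
  case UGT: return ULT;
  case ULE: return UGE;
  case UGE: return ULE;
  }
  return P;
}

static bool cmp(Pred P, uint32_t A, uint32_t B) {
  switch (P) {
  case ULT: return A < B;
  case UGT: return A > B;
  case ULE: return A <= B;
  case UGE: return A >= B;
  }
  return false;
}

int main() {
  // icmp pred A, B == icmp swapped(pred) B, A -- the identity the
  // simplifyICmpWithBinOpOnLHS refactor relies on.
  for (uint32_t A : {0u, 1u, 7u})
    for (uint32_t B : {0u, 3u})
      for (Pred P : {ULT, UGT, ULE, UGE})
        assert(cmp(P, A, B) == cmp(swapped(P), B, A));
  return 0;
}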
- if (RBO && (match(RBO, m_LShr(m_Specific(LHS), m_Value())) || - match(RBO, m_UDiv(m_Specific(LHS), m_Value())))) { - // icmp pred X, (X op Y) - if (Pred == ICmpInst::ICMP_ULT) - return getFalse(ITy); - if (Pred == ICmpInst::ICMP_UGE) - return getTrue(ITy); - } - // handle: // CI2 << X == CI // CI2 << X != CI @@ -3701,6 +3650,14 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, // LHS != Inf if (Pred == FCmpInst::FCMP_UNE && isKnownNeverInfinity(LHS, Q.TLI)) return getTrue(RetTy); + // LHS == Inf || LHS == NaN + if (Pred == FCmpInst::FCMP_UEQ && isKnownNeverInfinity(LHS, Q.TLI) && + isKnownNeverNaN(LHS, Q.TLI)) + return getFalse(RetTy); + // LHS != Inf && LHS != NaN + if (Pred == FCmpInst::FCMP_ONE && isKnownNeverInfinity(LHS, Q.TLI) && + isKnownNeverNaN(LHS, Q.TLI)) + return getTrue(RetTy); } if (C->isNegative() && !C->isNegZero()) { assert(!C->isNaN() && "Unexpected NaN constant!"); @@ -5241,11 +5198,112 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, return nullptr; } +static Intrinsic::ID getMaxMinOpposite(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::smax: return Intrinsic::smin; + case Intrinsic::smin: return Intrinsic::smax; + case Intrinsic::umax: return Intrinsic::umin; + case Intrinsic::umin: return Intrinsic::umax; + default: llvm_unreachable("Unexpected intrinsic"); + } +} + +static APInt getMaxMinLimit(Intrinsic::ID ID, unsigned BitWidth) { + switch (ID) { + case Intrinsic::smax: return APInt::getSignedMaxValue(BitWidth); + case Intrinsic::smin: return APInt::getSignedMinValue(BitWidth); + case Intrinsic::umax: return APInt::getMaxValue(BitWidth); + case Intrinsic::umin: return APInt::getMinValue(BitWidth); + default: llvm_unreachable("Unexpected intrinsic"); + } +} + static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, const SimplifyQuery &Q) { Intrinsic::ID IID = F->getIntrinsicID(); Type *ReturnType = F->getReturnType(); + unsigned BitWidth = ReturnType->getScalarSizeInBits(); switch (IID) { + case Intrinsic::abs: + // abs(abs(x)) -> abs(x). We don't need to worry about the nsw arg here. + // It is always ok to pick the earlier abs. We'll just lose nsw if its only + // on the outer abs. + if (match(Op0, m_Intrinsic(m_Value(), m_Value()))) + return Op0; + // If the sign bit is clear already, then abs does not do anything. + if (isKnownNonNegative(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + return Op0; + break; + + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: { + // If the arguments are the same, this is a no-op. + if (Op0 == Op1) + return Op0; + + // Canonicalize constant operand as Op1. + if (isa(Op0)) + std::swap(Op0, Op1); + + // Assume undef is the limit value. 
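That comment is the crux of the fold completed just below: undef may be refined to any value, so picking the intrinsic's limit (for example 255 for umax on i8) is always legal. A standalone numeric model of the limit and identity folds for umax on 8-bit values:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const uint8_t Limit = std::numeric_limits<uint8_t>::max(); // umax limit: 255
  for (unsigned X = 0; X <= 255; ++X) {
    uint8_t V = static_cast<uint8_t>(X);
    // umax(x, 255) --> 255: the "clamp to limit value" fold.
    assert(std::max(V, Limit) == Limit);
    // umax(x, 0) --> x: the opposite intrinsic's limit is umax's identity.
    assert(std::max<uint8_t>(V, 0) == V);
  }
  return 0;
}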
+ if (isa(Op1)) + return ConstantInt::get(ReturnType, getMaxMinLimit(IID, BitWidth)); + + auto hasSpecificOperand = [](IntrinsicInst *II, Value *V) { + return II->getOperand(0) == V || II->getOperand(1) == V; + }; + + // For 4 commuted variants of each intrinsic: + // max (max X, Y), X --> max X, Y + // max (min X, Y), X --> X + if (auto *MinMax0 = dyn_cast(Op0)) { + Intrinsic::ID InnerID = MinMax0->getIntrinsicID(); + if (InnerID == IID && hasSpecificOperand(MinMax0, Op1)) + return MinMax0; + if (InnerID == getMaxMinOpposite(IID) && hasSpecificOperand(MinMax0, Op1)) + return Op1; + } + if (auto *MinMax1 = dyn_cast(Op1)) { + Intrinsic::ID InnerID = MinMax1->getIntrinsicID(); + if (InnerID == IID && hasSpecificOperand(MinMax1, Op0)) + return MinMax1; + if (InnerID == getMaxMinOpposite(IID) && hasSpecificOperand(MinMax1, Op0)) + return Op0; + } + + const APInt *C; + if (match(Op1, m_APIntAllowUndef(C))) { + // Clamp to limit value. For example: + // umax(i8 %x, i8 255) --> 255 + if (*C == getMaxMinLimit(IID, BitWidth)) + return ConstantInt::get(ReturnType, *C); + + // If the constant op is the opposite of the limit value, the other must + // be larger/smaller or equal. For example: + // umin(i8 %x, i8 255) --> %x + if (*C == getMaxMinLimit(getMaxMinOpposite(IID), BitWidth)) + return Op0; + + // Remove nested call if constant operands allow it. Example: + // max (max X, 7), 5 -> max X, 7 + auto *MinMax0 = dyn_cast(Op0); + if (MinMax0 && MinMax0->getIntrinsicID() == IID) { + // TODO: loosen undef/splat restrictions for vector constants. + Value *M00 = MinMax0->getOperand(0), *M01 = MinMax0->getOperand(1); + const APInt *InnerC; + if ((match(M00, m_APInt(InnerC)) || match(M01, m_APInt(InnerC))) && + ((IID == Intrinsic::smax && InnerC->sge(*C)) || + (IID == Intrinsic::smin && InnerC->sle(*C)) || + (IID == Intrinsic::umax && InnerC->uge(*C)) || + (IID == Intrinsic::umin && InnerC->ule(*C)))) + return Op0; + } + } + + break; + } case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: // X - X -> { 0, false } @@ -5452,28 +5510,9 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } } -Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { - Value *Callee = Call->getCalledOperand(); - - // musttail calls can only be simplified if they are also DCEd. - // As we can't guarantee this here, don't simplify them. - if (Call->isMustTailCall()) - return nullptr; - - // call undef -> undef - // call null -> undef - if (isa(Callee) || isa(Callee)) - return UndefValue::get(Call->getType()); - - Function *F = dyn_cast(Callee); - if (!F) - return nullptr; - - if (F->isIntrinsic()) - if (Value *Ret = simplifyIntrinsic(Call, Q)) - return Ret; - - if (!canConstantFoldCallTo(Call, F)) +static Value *tryConstantFoldCall(CallBase *Call, const SimplifyQuery &Q) { + auto *F = dyn_cast(Call->getCalledOperand()); + if (!F || !canConstantFoldCallTo(Call, F)) return nullptr; SmallVector ConstantArgs; @@ -5492,6 +5531,29 @@ Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { return ConstantFoldCall(Call, F, ConstantArgs, Q.TLI); } +Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { + // musttail calls can only be simplified if they are also DCEd. + // As we can't guarantee this here, don't simplify them. 
+ if (Call->isMustTailCall()) + return nullptr; + + // call undef -> undef + // call null -> undef + Value *Callee = Call->getCalledOperand(); + if (isa(Callee) || isa(Callee)) + return UndefValue::get(Call->getType()); + + if (Value *V = tryConstantFoldCall(Call, Q)) + return V; + + auto *F = dyn_cast(Callee); + if (F && F->isIntrinsic()) + if (Value *Ret = simplifyIntrinsic(Call, Q)) + return Ret; + + return nullptr; +} + /// Given operands for a Freeze, see if we can fold the result. static Value *SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) { // Use a utility function defined in ValueTracking. diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index f5ffa7286b3b8..a95d1bca2fb51 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -388,8 +388,8 @@ class LazyValueInfoImpl { BasicBlock *BB); Optional solveBlockValueSelect(SelectInst *S, BasicBlock *BB); - Optional getRangeForOperand(unsigned Op, Instruction *I, - BasicBlock *BB); + Optional getRangeFor(Value *V, Instruction *CxtI, + BasicBlock *BB); Optional solveBlockValueBinaryOpImpl( Instruction *I, BasicBlock *BB, std::function solveBlockValueOverflowIntrinsic( WithOverflowInst *WO, BasicBlock *BB); - Optional solveBlockValueSaturatingIntrinsic( - SaturatingInst *SI, BasicBlock *BB); Optional solveBlockValueIntrinsic(IntrinsicInst *II, BasicBlock *BB); Optional solveBlockValueExtractValue( @@ -608,13 +606,11 @@ Optional LazyValueInfoImpl::solveBlockValueImpl( static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) { if (LoadInst *L = dyn_cast(I)) { return L->getPointerAddressSpace() == 0 && - GetUnderlyingObject(L->getPointerOperand(), - L->getModule()->getDataLayout()) == Ptr; + getUnderlyingObject(L->getPointerOperand()) == Ptr; } if (StoreInst *S = dyn_cast(I)) { return S->getPointerAddressSpace() == 0 && - GetUnderlyingObject(S->getPointerOperand(), - S->getModule()->getDataLayout()) == Ptr; + getUnderlyingObject(S->getPointerOperand()) == Ptr; } if (MemIntrinsic *MI = dyn_cast(I)) { if (MI->isVolatile()) return false; @@ -624,13 +620,11 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) { if (!Len || Len->isZero()) return false; if (MI->getDestAddressSpace() == 0) - if (GetUnderlyingObject(MI->getRawDest(), - MI->getModule()->getDataLayout()) == Ptr) + if (getUnderlyingObject(MI->getRawDest()) == Ptr) return true; if (MemTransferInst *MTI = dyn_cast(MI)) if (MTI->getSourceAddressSpace() == 0) - if (GetUnderlyingObject(MTI->getRawSource(), - MTI->getModule()->getDataLayout()) == Ptr) + if (getUnderlyingObject(MTI->getRawSource()) == Ptr) return true; } return false; @@ -643,11 +637,10 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) { static bool isObjectDereferencedInBlock(Value *Val, BasicBlock *BB) { assert(Val->getType()->isPointerTy()); - const DataLayout &DL = BB->getModule()->getDataLayout(); - Value *UnderlyingVal = GetUnderlyingObject(Val, DL); - // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge + Value *UnderlyingVal = getUnderlyingObject(Val); + // If 'getUnderlyingObject' didn't converge, skip it. It won't converge // inside InstructionDereferencesPointer either. 
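The fixpoint test on the next line is a handy idiom: strip one more step with MaxLookup 1 and compare against the input; equality means the bounded walk actually converged rather than running out of budget mid-chain. A hedged sketch of the same check; strippingConverged is a hypothetical helper:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// True if V is already a fixpoint of the stripping walk, i.e. one more
// step would leave it unchanged. The full walk's MaxLookup budget can
// give up mid-chain, and callers use this test to detect that case.
static bool strippingConverged(Value *V) {
  return V == getUnderlyingObject(V, /*MaxLookup=*/1);
}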
- if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, DL, 1)) + if (UnderlyingVal == getUnderlyingObject(UnderlyingVal, 1)) for (Instruction &I : *BB) if (InstructionDereferencesPointer(&I, UnderlyingVal)) return true; @@ -919,20 +912,19 @@ Optional LazyValueInfoImpl::solveBlockValueSelect( return Result; } -Optional LazyValueInfoImpl::getRangeForOperand(unsigned Op, - Instruction *I, - BasicBlock *BB) { - Optional OptVal = getBlockValue(I->getOperand(Op), BB); +Optional LazyValueInfoImpl::getRangeFor(Value *V, + Instruction *CxtI, + BasicBlock *BB) { + Optional OptVal = getBlockValue(V, BB); if (!OptVal) return None; ValueLatticeElement &Val = *OptVal; - intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I); + intersectAssumeOrGuardBlockValueConstantRange(V, Val, CxtI); if (Val.isConstantRange()) return Val.getConstantRange(); - const unsigned OperandBitWidth = - DL.getTypeSizeInBits(I->getOperand(Op)->getType()); + const unsigned OperandBitWidth = DL.getTypeSizeInBits(V->getType()); return ConstantRange::getFull(OperandBitWidth); } @@ -962,7 +954,7 @@ Optional LazyValueInfoImpl::solveBlockValueCast( // Figure out the range of the LHS. If that fails, we still apply the // transfer rule on the full set since we may be able to locally infer // interesting facts. - Optional LHSRes = getRangeForOperand(0, CI, BB); + Optional LHSRes = getRangeFor(CI->getOperand(0), CI, BB); if (!LHSRes.hasValue()) // More work to do before applying this transfer rule. return None; @@ -985,8 +977,8 @@ Optional LazyValueInfoImpl::solveBlockValueBinaryOpImpl( // conservative range, but apply the transfer rule anyways. This // lets us pick up facts from expressions like "and i32 (call i32 // @foo()), 32" - Optional LHSRes = getRangeForOperand(0, I, BB); - Optional RHSRes = getRangeForOperand(1, I, BB); + Optional LHSRes = getRangeFor(I->getOperand(0), I, BB); + Optional RHSRes = getRangeFor(I->getOperand(1), I, BB); if (!LHSRes.hasValue() || !RHSRes.hasValue()) // More work to do before applying this transfer rule. 
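The generalized getRangeFor below feeds each operand's lattice value, as a ConstantRange, into a transfer function for the instruction being solved. A minimal sketch of one such transfer using the ConstantRange API; addTransfer is an illustrative helper:

#include "llvm/IR/ConstantRange.h"

using namespace llvm;

// A binary transfer function over operand ranges, e.g.
// [0,10) + [5,7) = [5,16).
static ConstantRange addTransfer(const ConstantRange &L,
                                 const ConstantRange &R) {
  return L.add(R);
}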
return None; @@ -1036,43 +1028,24 @@ LazyValueInfoImpl::solveBlockValueOverflowIntrinsic(WithOverflowInst *WO, }); } -Optional -LazyValueInfoImpl::solveBlockValueSaturatingIntrinsic(SaturatingInst *SI, - BasicBlock *BB) { - switch (SI->getIntrinsicID()) { - case Intrinsic::uadd_sat: - return solveBlockValueBinaryOpImpl( - SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) { - return CR1.uadd_sat(CR2); - }); - case Intrinsic::usub_sat: - return solveBlockValueBinaryOpImpl( - SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) { - return CR1.usub_sat(CR2); - }); - case Intrinsic::sadd_sat: - return solveBlockValueBinaryOpImpl( - SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) { - return CR1.sadd_sat(CR2); - }); - case Intrinsic::ssub_sat: - return solveBlockValueBinaryOpImpl( - SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) { - return CR1.ssub_sat(CR2); - }); - default: - llvm_unreachable("All llvm.sat intrinsic are handled."); - } -} - Optional LazyValueInfoImpl::solveBlockValueIntrinsic( IntrinsicInst *II, BasicBlock *BB) { - if (auto *SI = dyn_cast(II)) - return solveBlockValueSaturatingIntrinsic(SI, BB); + if (!ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { + LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() + << "' - overdefined (unknown intrinsic).\n"); + return ValueLatticeElement::getOverdefined(); + } - LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName() - << "' - overdefined (unknown intrinsic).\n"); - return ValueLatticeElement::getOverdefined(); + SmallVector OpRanges; + for (Value *Op : II->args()) { + Optional Range = getRangeFor(Op, II, BB); + if (!Range) + return None; + OpRanges.push_back(*Range); + } + + return ValueLatticeElement::getRange( + ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges)); } Optional LazyValueInfoImpl::solveBlockValueExtractValue( diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 38017530c3227..4a159d6035f0d 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -673,7 +673,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, // TODO: Look through eliminable cast pairs. // TODO: Look through calls with unique return values. // TODO: Look through vector insert/extract/shuffle. - V = OffsetOk ? GetUnderlyingObject(V, *DL) : V->stripPointerCasts(); + V = OffsetOk ? getUnderlyingObject(V) : V->stripPointerCasts(); if (LoadInst *L = dyn_cast(V)) { BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index ae282a7a10952..85f658619e41e 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -393,7 +393,10 @@ void RuntimePointerChecking::groupChecks( // equivalence class, the iteration order is deterministic. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end(); MI != ME; ++MI) { - unsigned Pointer = PositionMap[MI->getPointer()]; + auto PointerI = PositionMap.find(MI->getPointer()); + assert(PointerI != PositionMap.end() && + "pointer in equivalence class not found in PositionMap"); + unsigned Pointer = PointerI->second; bool Merged = false; // Mark this pointer as seen. 
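In the groupChecks hunk just above, DenseMap::operator[] gives way to find plus an assert: operator[] default-constructs a value for a missing key, which here would silently yield a bogus index 0. A standalone sketch of the safer lookup; lookupIndex is an illustrative helper:

#include "llvm/ADT/DenseMap.h"
#include <cassert>

// find + assert instead of operator[]: a missing key fails loudly rather
// than being default-constructed to index 0.
static unsigned lookupIndex(const llvm::DenseMap<const void *, unsigned> &Map,
                            const void *Key) {
  auto It = Map.find(Key);
  assert(It != Map.end() && "key missing from position map");
  return It->second;
}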
Seen.insert(Pointer); @@ -505,10 +508,10 @@ class AccessAnalysis { typedef PointerIntPair MemAccessInfo; typedef SmallVector MemAccessInfoList; - AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AAResults *AA, - LoopInfo *LI, MemoryDepChecker::DepCandidates &DA, + AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI, + MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE) - : DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), + : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false), PSE(PSE) {} /// Register a load and whether it is only read from. @@ -582,8 +585,6 @@ class AccessAnalysis { /// Set of all accesses. PtrAccessSet Accesses; - const DataLayout &DL; - /// The loop being checked. const Loop *TheLoop; @@ -726,52 +727,55 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, SmallVector Retries; + // First, count how many write and read accesses are in the alias set. Also + // collect MemAccessInfos for later. + SmallVector AccessInfos; for (auto A : AS) { Value *Ptr = A.getValue(); bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); if (IsWrite) ++NumWritePtrChecks; else ++NumReadPtrChecks; + AccessInfos.emplace_back(Ptr, IsWrite); + } + + // We do not need runtime checks for this alias set, if there are no writes + // or a single write and no reads. + if (NumWritePtrChecks == 0 || + (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { + assert((AS.size() <= 1 || + all_of(AS, + [this](auto AC) { + MemAccessInfo AccessWrite(AC.getValue(), true); + return DepCands.findValue(AccessWrite) == DepCands.end(); + })) && + "Can only skip updating CanDoRT below, if all entries in AS " + "are reads or there is at most 1 entry"); + continue; + } + for (auto &Access : AccessInfos) { if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop, RunningDepId, ASId, ShouldCheckWrap, false)) { - LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n'); + LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" + << *Access.getPointer() << '\n'); Retries.push_back(Access); CanDoAliasSetRT = false; } } - // If we have at least two writes or one write and a read then we need to - // check them. But there is no need to checks if there is only one - // dependence set for this alias set. - // // Note that this function computes CanDoRT and MayNeedRTCheck // independently. For example CanDoRT=false, MayNeedRTCheck=false means that // we have a pointer for which we couldn't find the bounds but we don't // actually need to emit any checks so it does not matter. - bool NeedsAliasSetRTCheck = false; - if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2)) { - NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 || - (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1)); - // For alias sets without at least 2 writes or 1 write and 1 read, there - // is no need to generate RT checks and CanDoAliasSetRT for this alias set - // does not impact whether runtime checks can be generated. 
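The restructured canCheckPtrAtRT above hoists a cheap screen to the top of the per-alias-set loop, before any bounds are computed: an alias set with no writes, or exactly one write and no reads, can never need a runtime check. A standalone sketch of that screen; mayNeedRuntimeChecks is an illustrative helper:

// With no writes, or a single write and no reads, no cross-iteration
// conflict is possible, so the per-pointer bound computation is skipped
// entirely for the alias set.
static bool mayNeedRuntimeChecks(unsigned NumWrites, unsigned NumReads) {
  if (NumWrites == 0)
    return false;
  if (NumWrites == 1 && NumReads == 0)
    return false;
  return true;
}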
- if (!NeedsAliasSetRTCheck) { - assert((AS.size() <= 1 || - all_of(AS, - [this](auto AC) { - MemAccessInfo AccessWrite(AC.getValue(), true); - return DepCands.findValue(AccessWrite) == - DepCands.end(); - })) && - "Can only skip updating CanDoRT below, if all entries in AS " - "are reads or there is at most 1 entry"); - continue; - } - } + // + // We need runtime checks for this alias set, if there are at least 2 + // dependence sets (in which case RunningDepId > 2) or if we need to re-try + // any bound checks (because in that case the number of dependence sets is + // incomplete). + bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); // We need to perform run-time alias checks, but some pointers had bounds // that couldn't be checked. @@ -932,7 +936,7 @@ void AccessAnalysis::processMemAccesses() { typedef SmallVector ValueVector; ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL, LI); + getUnderlyingObjects(Ptr, TempObjects, LI); LLVM_DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n"); for (const Value *UnderlyingObj : TempObjects) { @@ -1136,7 +1140,7 @@ bool llvm::sortPtrAccesses(ArrayRef VL, const DataLayout &DL, // first pointer in the array. Value *Ptr0 = VL[0]; const SCEV *Scev0 = SE.getSCEV(Ptr0); - Value *Obj0 = GetUnderlyingObject(Ptr0, DL); + Value *Obj0 = getUnderlyingObject(Ptr0); llvm::SmallSet Offsets; for (auto *Ptr : VL) { @@ -1147,7 +1151,7 @@ bool llvm::sortPtrAccesses(ArrayRef VL, const DataLayout &DL, return false; // If a pointer refers to a different underlying object, bail - the // pointers are by definition incomparable. - Value *CurrObj = GetUnderlyingObject(Ptr, DL); + Value *CurrObj = getUnderlyingObject(Ptr); if (CurrObj != Obj0) return false; @@ -1941,10 +1945,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, } MemoryDepChecker::DepCandidates DependentAccesses; - AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(), - TheLoop, AA, LI, DependentAccesses, *PSE); + AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE); - // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects + // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once // for read and once for write, it will only appear once (on the write // list). This is okay, since we are going to check for conflicts between diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 566eba5c54aff..7f3de0fcf140a 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -406,8 +406,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( isInvariantLoad = true; } - const DataLayout &DL = BB->getModule()->getDataLayout(); - // Return "true" if and only if the instruction I is either a non-simple // load or a non-simple store. auto isNonSimpleLoadOrStore = [](Instruction *I) -> bool { @@ -576,7 +574,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // looking for a clobber in many cases; that's an alias property and is // handled by BasicAA. 
if (isa(Inst) || isNoAliasFn(Inst, &TLI)) { - const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, DL); + const Value *AccessPtr = getUnderlyingObject(MemLoc.Ptr); if (AccessPtr == Inst || AA.isMustAlias(Inst, AccessPtr)) return MemDepResult::getDef(Inst); } diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index f2f5fd70f4718..6fc827ae2b17c 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -519,9 +519,16 @@ template class ClobberWalker { UpwardsMemoryQuery *Query; unsigned *UpwardWalkLimit; - // Phi optimization bookkeeping + // Phi optimization bookkeeping: + // List of DefPath to process during the current phi optimization walk. SmallVector Paths; + // List of visited pairs; we can skip paths already + // visited with the same memory location. DenseSet VisitedPhis; + // Record if phi translation has been performed during the current phi + // optimization walk, as merging alias results after phi translation can + // yield incorrect results. Context in PR46156. + bool PerformedPhiTranslation = false; /// Find the nearest def or phi that `From` can legally be optimized to. const MemoryAccess *getWalkTarget(const MemoryPhi *From) const { @@ -596,12 +603,13 @@ template class ClobberWalker { void addSearches(MemoryPhi *Phi, SmallVectorImpl &PausedSearches, ListIndex PriorNode) { - auto UpwardDefs = make_range( - upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT), upward_defs_end()); + auto UpwardDefsBegin = upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT); + auto UpwardDefs = make_range(UpwardDefsBegin, upward_defs_end()); for (const MemoryAccessPair &P : UpwardDefs) { PausedSearches.push_back(Paths.size()); Paths.emplace_back(P.second, P.first, PriorNode); } + PerformedPhiTranslation |= UpwardDefsBegin.performedPhiTranslation(); } /// Represents a search that terminated after finding a clobber. This clobber @@ -651,8 +659,16 @@ template class ClobberWalker { // - We still cache things for A, so C only needs to walk up a bit. // If this behavior becomes problematic, we can fix without a ton of extra // work. - if (!VisitedPhis.insert({Node.Last, Node.Loc}).second) + if (!VisitedPhis.insert({Node.Last, Node.Loc}).second) { + if (PerformedPhiTranslation) { + // If visiting this path performed Phi translation, don't continue, + // since it may not be correct to merge results from two paths if one + // relies on the phi translation. + TerminatedPath Term{Node.Last, PathIndex}; + return Term; + } continue; + } const MemoryAccess *SkipStopWhere = nullptr; if (Query->SkipSelfAccess && Node.Loc == Query->StartingLoc) { @@ -765,7 +781,7 @@ template class ClobberWalker { /// terminates when a MemoryAccess that clobbers said MemoryLocation is found. 
OptznResult tryOptimizePhi(MemoryPhi *Phi, MemoryAccess *Start, const MemoryLocation &Loc) { - assert(Paths.empty() && VisitedPhis.empty() && + assert(Paths.empty() && VisitedPhis.empty() && !PerformedPhiTranslation && "Reset the optimization state."); Paths.emplace_back(Loc, Start, Phi, None); @@ -921,6 +937,7 @@ template class ClobberWalker { void resetPhiOptznState() { Paths.clear(); VisitedPhis.clear(); + PerformedPhiTranslation = false; } public: diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index 85af091772e7e..81582413a6042 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -319,8 +319,7 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { bool DefBeforeSameBlock = false; if (DefBefore->getBlock() == MD->getBlock() && !(isa(DefBefore) && - std::find(InsertedPHIs.begin(), InsertedPHIs.end(), DefBefore) != - InsertedPHIs.end())) + llvm::is_contained(InsertedPHIs, DefBefore))) DefBeforeSameBlock = true; // There is a def before us, which means we can replace any store/phi uses @@ -832,8 +831,8 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, // Check number of predecessors, we only care if there's more than one. unsigned Count = 0; BasicBlock *Pred = nullptr; - for (auto &Pair : children({GD, BB})) { - Pred = Pair.second; + for (auto *Pi : GD->template getChildren(BB)) { + Pred = Pi; Count++; if (Count == 2) break; @@ -926,8 +925,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, auto *BB = BBPredPair.first; const auto &AddedBlockSet = BBPredPair.second.Added; auto &PrevBlockSet = BBPredPair.second.Prev; - for (auto &Pair : children({GD, BB})) { - BasicBlock *Pi = Pair.second; + for (auto *Pi : GD->template getChildren(BB)) { if (!AddedBlockSet.count(Pi)) PrevBlockSet.insert(Pi); EdgeCountMap[{Pi, BB}]++; @@ -1078,10 +1076,8 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, for (unsigned I = 0, E = IDFPhi->getNumIncomingValues(); I < E; ++I) IDFPhi->setIncomingValue(I, GetLastDef(IDFPhi->getIncomingBlock(I))); } else { - for (auto &Pair : children({GD, BBIDF})) { - BasicBlock *Pi = Pair.second; + for (auto *Pi : GD->template getChildren(BBIDF)) IDFPhi->addIncoming(GetLastDef(Pi), Pi); - } } } } diff --git a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp index 80e019f5fc921..58eb5281a717d 100644 --- a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp +++ b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp @@ -54,8 +54,8 @@ AliasResult ObjCARCAAResult::alias(const MemoryLocation &LocA, // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *UA = GetUnderlyingObjCPtr(SA, DL); - const Value *UB = GetUnderlyingObjCPtr(SB, DL); + const Value *UA = GetUnderlyingObjCPtr(SA); + const Value *UB = GetUnderlyingObjCPtr(SB); if (UA != SA || UB != SB) { Result = AAResultBase::alias(MemoryLocation(UA), MemoryLocation(UB), AAQI); // We can't use MustAlias or PartialAlias results here because @@ -83,7 +83,7 @@ bool ObjCARCAAResult::pointsToConstantMemory(const MemoryLocation &Loc, // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. 
- const Value *U = GetUnderlyingObjCPtr(S, DL); + const Value *U = GetUnderlyingObjCPtr(S); if (U != S) return AAResultBase::pointsToConstantMemory(MemoryLocation(U), AAQI, OrLocal); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index a961be8cc35e1..23ad77ff60540 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -10628,8 +10628,15 @@ ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS, const SCEV *Start = IV->getStart(); const SCEV *End = RHS; - if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) - End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start); + if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) { + // If we know that Start >= RHS in the context of loop, then we know that + // min(RHS, Start) = RHS at this point. + if (isLoopEntryGuardedByCond( + L, IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, Start, RHS)) + End = RHS; + else + End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start); + } const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false); diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp index 8928678d6ab21..5e2aaab050afe 100644 --- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp +++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp @@ -185,7 +185,7 @@ ScopedNoAliasAAResult ScopedNoAliasAA::run(Function &F, char ScopedNoAliasAAWrapperPass::ID = 0; -INITIALIZE_PASS(ScopedNoAliasAAWrapperPass, "scoped-noalias", +INITIALIZE_PASS(ScopedNoAliasAAWrapperPass, "scoped-noalias-aa", "Scoped NoAlias Alias Analysis", false, true) ImmutablePass *llvm::createScopedNoAliasAAWrapperPass() { diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp index bbfc303aefac4..bcbe9a63c182d 100644 --- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp +++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp @@ -928,7 +928,7 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) { for (auto &GVS : Index) { for (auto &GV : GVS.second.SummaryList) { FunctionSummary *FS = dyn_cast(GV.get()); - if (!FS) + if (!FS || FS->paramAccesses().empty()) continue; if (FS->isLive() && FS->isDSOLocal()) { FunctionInfo FI; diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp index a6f5b2956399e..8fd4011e6cd42 100644 --- a/llvm/lib/Analysis/TFUtils.cpp +++ b/llvm/lib/Analysis/TFUtils.cpp @@ -13,9 +13,10 @@ #include "llvm/Config/config.h" #if defined(LLVM_HAVE_TF_API) -#include "llvm/Analysis/Utils/TFUtils.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/Utils/TFUtils.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/raw_ostream.h" @@ -83,11 +84,46 @@ class EvaluationResultImpl { std::vector Output; }; +Optional getTensorSpecFromJSON(LLVMContext &Ctx, + const json::Value &Value) { + auto EmitError = [&](const llvm::Twine &Message) -> Optional { + std::string S; + llvm::raw_string_ostream OS(S); + OS << Value; + Ctx.emitError("Unable to parse JSON Value as spec (" + Message + "): " + S); + return None; + }; + json::ObjectMapper Mapper(Value); + if (!Mapper) + return EmitError("Value is not a dict"); + + std::string TensorName; + int TensorPort = -1; + std::string TensorType; + std::vector TensorShape; + + if (!Mapper.map("name", TensorName)) + return EmitError("'name' property not present or not a string"); + if (!Mapper.map("type", TensorType)) + 
return EmitError("'type' property not present or not a string"); + if (!Mapper.map("port", TensorPort)) + return EmitError("'port' property not present or not an int"); + if (!Mapper.map>("shape", TensorShape)) + return EmitError("'shape' property not present or not an int array"); + +#define PARSE_TYPE(T, S, E) \ + if (TensorType == #S) \ + return TensorSpec::createSpec(TensorName, TensorShape, TensorPort); + TFUTILS_SUPPORTED_TYPES(PARSE_TYPE) +#undef PARSE_TYPE + return None; +} + class TFModelEvaluatorImpl { public: TFModelEvaluatorImpl(StringRef SavedModelPath, - const std::vector &InputNames, - const std::vector &OutputNames, + const std::vector &InputSpecs, + const std::vector &OutputSpecs, const char *Tags); bool isValid() const { return IsValid; } @@ -132,16 +168,17 @@ class TFModelEvaluatorImpl { /// Reusable utility for ensuring we can bind the requested Name to a node in /// the SavedModel Graph. - bool checkReportAndInvalidate(const TF_Output &Output, StringRef Name); + bool checkReportAndInvalidate(const TF_Output &Output, + const TensorSpec &OutputSpec); }; } // namespace llvm TFModelEvaluatorImpl::TFModelEvaluatorImpl( - StringRef SavedModelPath, const std::vector &InputNames, - const std::vector &OutputNames, const char *Tags) + StringRef SavedModelPath, const std::vector &InputSpecs, + const std::vector &OutputSpecs, const char *Tags) : Graph(createTFGraph()), Options(createTFSessionOptions()), - InputFeed(InputNames.size()), Input(InputNames.size()), - OutputFeed(OutputNames.size()) { + InputFeed(InputSpecs.size()), Input(InputSpecs.size()), + OutputFeed(OutputSpecs.size()) { if (!ensureInitTF()) { errs() << "Tensorflow should have been initialized"; return; @@ -155,25 +192,31 @@ TFModelEvaluatorImpl::TFModelEvaluatorImpl( errs() << TF_Message(Status.get()); invalidate(); } - for (size_t I = 0; I < InputNames.size(); ++I) { + for (size_t I = 0; I < InputSpecs.size(); ++I) { + auto &InputSpec = InputSpecs[I]; InputFeed[I] = { - TF_GraphOperationByName(Graph.get(), (InputNames[I]).c_str()), 0}; - if (!checkReportAndInvalidate(InputFeed[I], InputNames[I])) + TF_GraphOperationByName(Graph.get(), (InputSpec.name()).c_str()), + InputSpec.port()}; + if (!checkReportAndInvalidate(InputFeed[I], InputSpec)) return; + initInput(I, static_cast(InputSpec.typeIndex()), + InputSpec.shape()); } - for (size_t I = 0; I < OutputNames.size(); ++I) { + for (size_t I = 0; I < OutputSpecs.size(); ++I) { + auto &OutputSpec = OutputSpecs[I]; OutputFeed[I] = { - TF_GraphOperationByName(Graph.get(), (OutputNames[I]).c_str()), 0}; - if (!checkReportAndInvalidate(OutputFeed[I], OutputNames[I])) + TF_GraphOperationByName(Graph.get(), (OutputSpec.name()).c_str()), + OutputSpec.port()}; + if (!checkReportAndInvalidate(OutputFeed[I], OutputSpec)) return; } } TFModelEvaluator::TFModelEvaluator(StringRef SavedModelPath, - const std::vector &InputNames, - const std::vector &OutputNames, + const std::vector &InputSpecs, + const std::vector &OutputSpecs, const char *Tags) - : Impl(new TFModelEvaluatorImpl(SavedModelPath, InputNames, OutputNames, + : Impl(new TFModelEvaluatorImpl(SavedModelPath, InputSpecs, OutputSpecs, Tags)) { if (!Impl->isValid()) Impl.reset(); @@ -192,11 +235,11 @@ TFModelEvaluatorImpl::~TFModelEvaluatorImpl() { errs() << "Could not delete TF session"; } -bool TFModelEvaluatorImpl::checkReportAndInvalidate(const TF_Output &Output, - StringRef Name) { +bool TFModelEvaluatorImpl::checkReportAndInvalidate( + const TF_Output &Output, const TensorSpec &OutputSpec) { if (Output.oper) return 
true; - errs() << "Could not find TF_Output named: " + Name; + errs() << "Could not find TF_Output named: " + OutputSpec.name(); IsValid = false; return IsValid; } @@ -242,50 +285,12 @@ void *TFModelEvaluator::EvaluationResult::getUntypedTensorValue(size_t Index) { return TF_TensorData(Impl->getOutput()[Index]); } -void TFModelEvaluator::initInput(size_t Index, int TypeIndex, - const std::vector &Dimensions) { - Impl->initInput(Index, static_cast(TypeIndex), Dimensions); -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_FLOAT; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_DOUBLE; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_INT8; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_UINT8; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_INT16; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_UINT16; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_INT32; -} - -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_UINT32; -} +#define TFUTILS_GETDATATYPE_IMPL(T, S, E) \ + template <> int TensorSpec::getDataType() { return TF_##E; } -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_INT64; -} +TFUTILS_SUPPORTED_TYPES(TFUTILS_GETDATATYPE_IMPL) -template <> int TFModelEvaluator::getModelTypeIndex() { - return TF_UINT64; -} +#undef TFUTILS_GETDATATYPE_IMPL TFModelEvaluator::EvaluationResult::~EvaluationResult() {} TFModelEvaluator::~TFModelEvaluator() {} diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c9e702ce56b81..944e621fc0701 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -730,12 +730,57 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, VectorType *Ty, return Cost; } +TTI::CastContextHint +TargetTransformInfo::getCastContextHint(const Instruction *I) { + if (!I) + return CastContextHint::None; + + auto getLoadStoreKind = [](const Value *V, unsigned LdStOp, unsigned MaskedOp, + unsigned GatScatOp) { + const Instruction *I = dyn_cast(V); + if (!I) + return CastContextHint::None; + + if (I->getOpcode() == LdStOp) + return CastContextHint::Normal; + + if (const IntrinsicInst *II = dyn_cast(I)) { + if (II->getIntrinsicID() == MaskedOp) + return TTI::CastContextHint::Masked; + if (II->getIntrinsicID() == GatScatOp) + return TTI::CastContextHint::GatherScatter; + } + + return TTI::CastContextHint::None; + }; + + switch (I->getOpcode()) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPExt: + return getLoadStoreKind(I->getOperand(0), Instruction::Load, + Intrinsic::masked_load, Intrinsic::masked_gather); + case Instruction::Trunc: + case Instruction::FPTrunc: + if (I->hasOneUse()) + return getLoadStoreKind(*I->user_begin(), Instruction::Store, + Intrinsic::masked_store, + Intrinsic::masked_scatter); + break; + default: + return CastContextHint::None; + } + + return TTI::CastContextHint::None; +} + int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const { assert((I == nullptr || I->getOpcode() == Opcode) && "Opcode should reflect passed instruction."); - int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, CostKind, I); + int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); assert(Cost >= 0 && "TTI should not produce 
negative costs!"); return Cost; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0ab2a1350af31..af55c96f3a531 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1642,6 +1642,28 @@ static void computeKnownBitsFromOperator(const Operator *I, if (const IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::abs: + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + + // If the source's MSB is zero then we know the rest of the bits. + if (Known2.isNonNegative()) { + Known.Zero |= Known2.Zero; + Known.One |= Known2.One; + break; + } + + // Absolute value preserves trailing zero count. + Known.Zero.setLowBits(Known2.Zero.countTrailingOnes()); + + // If this call is undefined for INT_MIN, the result is positive. We + // also know it can't be INT_MIN if there is a set bit that isn't the + // sign bit. + Known2.One.clearSignBit(); + if (match(II->getArgOperand(1), m_One()) || Known2.One.getBoolValue()) + Known.Zero.setSignBit(); + // FIXME: Handle known negative input? + // FIXME: Calculate the negated Known bits and combine them? + break; case Intrinsic::bitreverse: computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.reverseBits(); @@ -2990,6 +3012,19 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, "Failed to determine minimum sign bits"); return Tmp; } + case Instruction::Call: { + if (const auto *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::abs: + Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); + if (Tmp == 1) break; + + // Absolute value reduces number of sign bits by at most 1. + return Tmp - 1; + } + } + } } } @@ -4149,8 +4184,7 @@ static bool isSameUnderlyingObjectInLoop(const PHINode *PN, return true; } -Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, - unsigned MaxLookup) { +Value *llvm::getUnderlyingObject(Value *V, unsigned MaxLookup) { if (!V->getType()->isPointerTy()) return V; for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { @@ -4195,16 +4229,15 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, return V; } -void llvm::GetUnderlyingObjects(const Value *V, +void llvm::getUnderlyingObjects(const Value *V, SmallVectorImpl &Objects, - const DataLayout &DL, LoopInfo *LI, - unsigned MaxLookup) { + LoopInfo *LI, unsigned MaxLookup) { SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(V); do { const Value *P = Worklist.pop_back_val(); - P = GetUnderlyingObject(P, DL, MaxLookup); + P = getUnderlyingObject(P, MaxLookup); if (!Visited.insert(P).second) continue; @@ -4265,19 +4298,18 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) { } while (true); } -/// This is a wrapper around GetUnderlyingObjects and adds support for basic +/// This is a wrapper around getUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. -/// It returns false if unidentified object is found in GetUnderlyingObjects. +/// It returns false if unidentified object is found in getUnderlyingObjects. 
bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, - SmallVectorImpl &Objects, - const DataLayout &DL) { + SmallVectorImpl &Objects) { SmallPtrSet Visited; SmallVector Working(1, V); do { V = Working.pop_back_val(); SmallVector Objs; - GetUnderlyingObjects(V, Objs, DL); + getUnderlyingObjects(V, Objs); for (const Value *V : Objs) { if (!Visited.insert(V).second) @@ -4290,7 +4322,7 @@ bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, continue; } } - // If GetUnderlyingObjects fails to find an identifiable object, + // If getUnderlyingObjects fails to find an identifiable object, // getUnderlyingObjectsForCodeGen also fails for safety. if (!isIdentifiedObject(V)) { Objects.clear(); @@ -4302,18 +4334,72 @@ bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, return true; } -/// Return true if the only users of this pointer are lifetime markers. -bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { +static AllocaInst * +findAllocaForValue(Value *V, DenseMap &AllocaForValue) { + if (AllocaInst *AI = dyn_cast(V)) + return AI; + // See if we've already calculated (or started to calculate) alloca for a + // given value. + auto I = AllocaForValue.find(V); + if (I != AllocaForValue.end()) + return I->second; + // Store 0 while we're calculating alloca for value V to avoid + // infinite recursion if the value references itself. + AllocaForValue[V] = nullptr; + AllocaInst *Res = nullptr; + if (CastInst *CI = dyn_cast(V)) + Res = findAllocaForValue(CI->getOperand(0), AllocaForValue); + else if (PHINode *PN = dyn_cast(V)) { + for (Value *IncValue : PN->incoming_values()) { + // Allow self-referencing phi-nodes. + if (IncValue == PN) + continue; + AllocaInst *IncValueAI = findAllocaForValue(IncValue, AllocaForValue); + // AI for incoming values should exist and should all be equal. + if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res)) + return nullptr; + Res = IncValueAI; + } + } else if (GetElementPtrInst *EP = dyn_cast(V)) { + Res = findAllocaForValue(EP->getPointerOperand(), AllocaForValue); + } + if (Res) + AllocaForValue[V] = Res; + return Res; +} + +AllocaInst *llvm::findAllocaForValue(Value *V) { + DenseMap AllocaForValue; + return ::findAllocaForValue(V, AllocaForValue); +} + +static bool onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + const Value *V, bool AllowLifetime, bool AllowDroppable) { for (const User *U : V->users()) { const IntrinsicInst *II = dyn_cast(U); - if (!II) return false; - - if (!II->isLifetimeStartOrEnd()) + if (!II) return false; + + if (AllowLifetime && II->isLifetimeStartOrEnd()) + continue; + + if (AllowDroppable && II->isDroppable()) + continue; + + return false; } return true; } +bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { + return onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + V, /* AllowLifetime */ true, /* AllowDroppable */ false); +} +bool llvm::onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V) { + return onlyUsedByLifetimeMarkersOrDroppableInstsHelper( + V, /* AllowLifetime */ true, /* AllowDroppable */ true); +} + bool llvm::mustSuppressSpeculation(const LoadInst &LI) { if (!LI.isUnordered()) return true; @@ -4758,7 +4844,7 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) { const auto *CE = dyn_cast(Op); if (isa(Op) || (CE && CE->isCast())) return false; - else if (isa(Op)) + else if (Instruction::isBinaryOp(Opcode)) return false; // Be conservative and return true. 
return true; @@ -4794,8 +4880,8 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, isa(C) || isa(C)) return true; - if (C->getType()->isVectorTy()) - return !C->containsUndefElement() && !C->containsConstantExpression(); + if (C->getType()->isVectorTy() && !isa(C)) + return !C->containsConstantExpression() && !C->containsUndefElement(); } // Strip cast operations from a pointer value. diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index fbd5c80350627..cdcbd1579e28d 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -43,13 +43,18 @@ static cl::opt MaxInterleaveGroupFactor( /// hasVectorInstrinsicScalarOpd). bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::bswap: // Begin integer bit-manipulation. + case Intrinsic::abs: // Begin integer bit-manipulation. + case Intrinsic::bswap: case Intrinsic::bitreverse: case Intrinsic::ctpop: case Intrinsic::ctlz: case Intrinsic::cttz: case Intrinsic::fshl: case Intrinsic::fshr: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: case Intrinsic::uadd_sat: @@ -94,6 +99,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx) { switch (ID) { + case Intrinsic::abs: case Intrinsic::ctlz: case Intrinsic::cttz: case Intrinsic::powi: diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 9632b5700e8af..82b6f2078695f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -20,8 +20,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" -#include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/LLVMBitCodes.h" +#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -1715,7 +1716,7 @@ Error BitcodeReader::parseTypeTableBody() { case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries] // TYPE_CODE_NUMENTRY contains a count of the number of types in the // type list. This allows us to reserve space. 
- if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); TypeList.resize(Record[0]); continue; @@ -1756,7 +1757,7 @@ Error BitcodeReader::parseTypeTableBody() { ResultTy = Type::getTokenTy(Context); break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); uint64_t NumBits = Record[0]; @@ -1768,7 +1769,7 @@ Error BitcodeReader::parseTypeTableBody() { } case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or // [pointee type, address space] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); unsigned AddressSpace = 0; if (Record.size() == 2) @@ -1823,7 +1824,7 @@ Error BitcodeReader::parseTypeTableBody() { break; } case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); SmallVector EltTys; for (unsigned i = 1, e = Record.size(); i != e; ++i) { @@ -1843,7 +1844,7 @@ Error BitcodeReader::parseTypeTableBody() { continue; case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); if (NumRecords >= TypeList.size()) @@ -3715,7 +3716,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, break; /// MODULE_CODE_VSTOFFSET: [offset] case bitc::MODULE_CODE_VSTOFFSET: - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // Note that we subtract 1 here because the offset is relative to one word // before the start of the identification or module block, which was @@ -3869,7 +3870,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { default: // Default behavior: reject return error("Invalid value"); case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks] - if (Record.size() < 1 || Record[0] == 0) + if (Record.empty() || Record[0] == 0) return error("Invalid record"); // Create all the basic blocks for the function. FunctionBBs.resize(Record[0]); @@ -4711,7 +4712,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // The first record specifies the type. 
FullTy = getFullyStructuredTypeByID(Record[0]); @@ -4813,17 +4814,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align] if (Record.size() != 4) return error("Invalid record"); - uint64_t AlignRecord = Record[3]; - const uint64_t InAllocaMask = uint64_t(1) << 5; - const uint64_t ExplicitTypeMask = uint64_t(1) << 6; - const uint64_t SwiftErrorMask = uint64_t(1) << 7; - const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask | - SwiftErrorMask; - bool InAlloca = AlignRecord & InAllocaMask; - bool SwiftError = AlignRecord & SwiftErrorMask; + using APV = AllocaPackedValues; + const uint64_t Rec = Record[3]; + const bool InAlloca = Bitfield::get<APV::UsedWithInAlloca>(Rec); + const bool SwiftError = Bitfield::get<APV::SwiftError>(Rec); FullTy = getFullyStructuredTypeByID(Record[0]); Type *Ty = flattenPointerTypes(FullTy); - if ((AlignRecord & ExplicitTypeMask) == 0) { + if (!Bitfield::get<APV::ExplicitType>(Rec)) { auto *PTy = dyn_cast_or_null<PointerType>(Ty); if (!PTy) return error("Old-style alloca with a non-pointer type"); @@ -4832,7 +4829,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Type *OpTy = getTypeByID(Record[1]); Value *Size = getFnValueByID(Record[2], OpTy); MaybeAlign Align; - if (Error Err = parseAlignmentValue(AlignRecord & ~FlagMask, Align)) { + if (Error Err = + parseAlignmentValue(Bitfield::get<APV::Align>(Rec), Align)) { return Err; } if (!Ty || !Size) @@ -5207,7 +5205,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // number of operand bundle blocks. These blocks are read into // OperandBundles and consumed at the next call or invoke instruction. - if (Record.size() < 1 || Record[0] >= BundleTags.size()) + if (Record.empty() || Record[0] >= BundleTags.size()) return error("Invalid record"); std::vector<Value *> Inputs; @@ -5740,7 +5738,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() { } /// MODULE_CODE_VSTOFFSET: [offset] case bitc::MODULE_CODE_VSTOFFSET: - if (Record.size() < 1) + if (Record.empty()) return error("Invalid record"); // Note that we subtract 1 here because the offset is relative to one // word before the start of the identification or module block, which diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7d410b24573e7..7afef397e05ed 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Bitcode/BitcodeCommon.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/LLVMBitCodes.h" #include "llvm/Bitstream/BitCodes.h" @@ -394,6 +395,8 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); } + + unsigned getEncodedAlign(MaybeAlign Alignment) { return encode(Alignment); } }; /// Class to manage the bitcode writing for a combined index. @@ -1181,10 +1184,14 @@ void ModuleBitcodeWriter::writeModuleInfo() { // compute the maximum alignment value. std::map<std::string, unsigned> SectionMap; std::map<std::string, unsigned> GCMap; - unsigned MaxAlignment = 0; + MaybeAlign MaxAlignment; unsigned MaxGlobalType = 0; + const auto UpdateMaxAlignment = [&MaxAlignment](const MaybeAlign A) { + if (A) + MaxAlignment = !MaxAlignment ?
*A : std::max(*MaxAlignment, *A); + }; for (const GlobalVariable &GV : M.globals()) { - MaxAlignment = std::max(MaxAlignment, GV.getAlignment()); + UpdateMaxAlignment(GV.getAlign()); MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType())); if (GV.hasSection()) { // Give section names unique ID's. @@ -1197,7 +1204,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { } } for (const Function &F : M) { - MaxAlignment = std::max(MaxAlignment, F.getAlignment()); + UpdateMaxAlignment(F.getAlign()); if (F.hasSection()) { // Give section names unique ID's. unsigned &Entry = SectionMap[std::string(F.getSection())]; @@ -1233,10 +1240,10 @@ void ModuleBitcodeWriter::writeModuleInfo() { //| constant Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Initializer. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); // Linkage. - if (MaxAlignment == 0) // Alignment. + if (!MaxAlignment) // Alignment. Abbv->Add(BitCodeAbbrevOp(0)); else { - unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1; + unsigned MaxEncAlignment = getEncodedAlign(MaxAlignment); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxEncAlignment+1))); } @@ -1289,7 +1296,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(GV.isDeclaration() ? 0 : (VE.getValueID(GV.getInitializer()) + 1)); Vals.push_back(getEncodedLinkage(GV)); - Vals.push_back(Log2_32(GV.getAlignment())+1); + Vals.push_back(getEncodedAlign(GV.getAlign())); Vals.push_back(GV.hasSection() ? SectionMap[std::string(GV.getSection())] : 0); if (GV.isThreadLocal() || @@ -1335,7 +1342,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); Vals.push_back(VE.getAttributeListID(F.getAttributes())); - Vals.push_back(Log2_32(F.getAlignment())+1); + Vals.push_back(getEncodedAlign(F.getAlign())); Vals.push_back(F.hasSection() ? SectionMap[std::string(F.getSection())] : 0); Vals.push_back(getEncodedVisibility(F)); @@ -2945,14 +2952,13 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Vals.push_back(VE.getTypeID(AI.getAllocatedType())); Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); Vals.push_back(VE.getValueID(I.getOperand(0))); // size. 
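The hunk below swaps hand-rolled mask arithmetic for the AllocaPackedValues bitfields from the newly added BitcodeCommon.h include. Both encode the same record layout (field names per that header, widths inferred from the old masks): bits 0-4 carry the encoded alignment (Log2(Align) + 1, 0 meaning unspecified), bit 5 the inalloca flag, bit 6 the explicit-type flag, and bit 7 the swifterror flag. A self-contained sketch of that packing:

#include <cassert>
#include <cstdint>

static uint64_t packAllocaRecord(uint64_t EncAlign, bool InAlloca,
                                 bool ExplicitType, bool SwiftError) {
  assert(EncAlign < (1u << 5) && "encoded alignment must fit in 5 bits");
  return EncAlign | (uint64_t(InAlloca) << 5) | (uint64_t(ExplicitType) << 6) |
         (uint64_t(SwiftError) << 7);
}

int main() {
  // align 16 encodes as Log2(16) + 1 == 5; explicit-type bit set.
  assert(packAllocaRecord(5, false, true, false) == 0x45);
  return 0;
}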
- unsigned AlignRecord = Log2_32(AI.getAlignment()) + 1; - assert(Log2_32(Value::MaximumAlignment) + 1 < 1 << 5 && - "not enough bits for maximum alignment"); - assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64"); - AlignRecord |= AI.isUsedWithInAlloca() << 5; - AlignRecord |= 1 << 6; - AlignRecord |= AI.isSwiftError() << 7; - Vals.push_back(AlignRecord); + using APV = AllocaPackedValues; + unsigned Record = 0; + Bitfield::set<APV::Align>(Record, getEncodedAlign(AI.getAlign())); + Bitfield::set<APV::UsedWithInAlloca>(Record, AI.isUsedWithInAlloca()); + Bitfield::set<APV::ExplicitType>(Record, true); + Bitfield::set<APV::SwiftError>(Record, AI.isSwiftError()); + Vals.push_back(Record); break; } @@ -2966,7 +2972,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, AbbrevToUse = FUNCTION_INST_LOAD_ABBREV; } Vals.push_back(VE.getTypeID(I.getType())); - Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1); + Vals.push_back(getEncodedAlign(cast<LoadInst>(I).getAlign())); Vals.push_back(cast<LoadInst>(I).isVolatile()); if (cast<LoadInst>(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering())); @@ -2980,7 +2986,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_STORE; pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val - Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1); + Vals.push_back(getEncodedAlign(cast<StoreInst>(I).getAlign())); Vals.push_back(cast<StoreInst>(I).isVolatile()); if (cast<StoreInst>(I).isAtomic()) { Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering())); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index b50cae89b3494..78f18ab8aff6e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2295,6 +2295,16 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } switch (CE->getOpcode()) { + case Instruction::AddrSpaceCast: { + const Constant *Op = CE->getOperand(0); + unsigned DstAS = CE->getType()->getPointerAddressSpace(); + unsigned SrcAS = Op->getType()->getPointerAddressSpace(); + if (TM.isNoopAddrSpaceCast(SrcAS, DstAS)) + return lowerConstant(Op); + + // Fallthrough to error. + LLVM_FALLTHROUGH; + } default: { // If the code isn't optimized, there may be outstanding folding // opportunities.
Attempt to fold the expression using DataLayout as a diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index d81a9be26d39b..b6a9a95683603 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -241,6 +241,7 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { OutStreamer->emitCFIGnuArgsSize(Inst.getOffset()); break; case MCCFIInstruction::OpEscape: + OutStreamer->AddComment(Inst.getComment()); OutStreamer->emitCFIEscape(Inst.getValues()); break; case MCCFIInstruction::OpRestore: diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 5d52e7fb80920..b388e43447835 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -494,8 +494,7 @@ void CodeViewDebug::recordLocalVariable(LocalVariable &&Var, static void addLocIfNotPresent(SmallVectorImpl &Locs, const DILocation *Loc) { - auto B = Locs.begin(), E = Locs.end(); - if (std::find(B, E, Loc) == E) + if (!llvm::is_contained(Locs, Loc)) Locs.push_back(Loc); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 713a15dd09391..0524d666810cb 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -495,19 +495,25 @@ void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; } /// EmitValue - Emit label value. /// void DIELabel::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { - AP->emitLabelReference( - Label, SizeOf(AP, Form), - Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_sec_offset || - Form == dwarf::DW_FORM_ref_addr || Form == dwarf::DW_FORM_data4); + bool IsSectionRelative = Form != dwarf::DW_FORM_addr; + AP->emitLabelReference(Label, SizeOf(AP, Form), IsSectionRelative); } /// SizeOf - Determine size of label value in bytes. /// unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - if (Form == dwarf::DW_FORM_data4) return 4; - if (Form == dwarf::DW_FORM_sec_offset) return 4; - if (Form == dwarf::DW_FORM_strp) return 4; - return AP->MAI->getCodePointerSize(); + switch (Form) { + case dwarf::DW_FORM_data4: + return 4; + case dwarf::DW_FORM_sec_offset: + case dwarf::DW_FORM_strp: + // FIXME: add support for DWARF64 + return 4; + case dwarf::DW_FORM_addr: + return AP->MAI->getCodePointerSize(); + default: + llvm_unreachable("DIE Value form not supported yet"); + } } LLVM_DUMP_METHOD @@ -543,10 +549,15 @@ void DIEDelta::emitValue(const AsmPrinter *AP, dwarf::Form Form) const { /// SizeOf - Determine size of delta value in bytes. 
/// unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - if (Form == dwarf::DW_FORM_data4) return 4; - if (Form == dwarf::DW_FORM_sec_offset) return 4; - if (Form == dwarf::DW_FORM_strp) return 4; - return AP->MAI->getCodePointerSize(); + switch (Form) { + case dwarf::DW_FORM_data4: + return 4; + case dwarf::DW_FORM_sec_offset: + // FIXME: add support for DWARF64 + return 4; + default: + llvm_unreachable("DIE Value form not supported yet"); + } } LLVM_DUMP_METHOD @@ -809,13 +820,17 @@ void DIEBlock::print(raw_ostream &O) const { //===----------------------------------------------------------------------===// unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { - if (Form == dwarf::DW_FORM_loclistx) + switch (Form) { + case dwarf::DW_FORM_loclistx: return getULEB128Size(Index); - if (Form == dwarf::DW_FORM_data4) + case dwarf::DW_FORM_data4: return 4; - if (Form == dwarf::DW_FORM_sec_offset) + case dwarf::DW_FORM_sec_offset: + // FIXME: add support for DWARF64 return 4; - return AP->MAI->getCodePointerSize(); + default: + llvm_unreachable("DIE Value form not supported yet"); + } } /// EmitValue - Emit label value. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index ece6665e99f6c..704fff246b6a4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -439,8 +439,8 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) { true}); DIELoc *Loc = new (DIEValueAllocator) DIELoc; addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_WASM_location); - addSInt(*Loc, dwarf::DW_FORM_sdata, FrameBase.Location.WasmLoc.Kind); - addLabel(*Loc, dwarf::DW_FORM_udata, SPSym); + addSInt(*Loc, dwarf::DW_FORM_sdata, TI_GLOBAL_RELOC); + addLabel(*Loc, dwarf::DW_FORM_data4, SPSym); DD->addArangeLabel(SymbolCU(this, SPSym)); addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value); addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc); @@ -678,9 +678,9 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, // Add variable address. 
- unsigned Offset = DV.getDebugLocListIndex(); - if (Offset != ~0U) { - addLocationList(*VariableDie, dwarf::DW_AT_location, Offset); + unsigned Index = DV.getDebugLocListIndex(); + if (Index != ~0U) { + addLocationList(*VariableDie, dwarf::DW_AT_location, Index); auto TagOffset = DV.getDebugLocListTagOffset(); if (TagOffset) addUInt(*VariableDie, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1, @@ -990,7 +990,7 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( } bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const { - return DD->getDwarfVersion() == 4 && DD->tuneForGDB(); + return DD->getDwarfVersion() == 4 && !DD->tuneForLLDB(); } dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5a4c4dfb90a2a..1169adaaf470c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2488,7 +2488,6 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.addSignedConstant(Value.getInt()); else DwarfExpr.addUnsignedConstant(Value.getInt()); - DwarfExpr.addExpression(std::move(ExprCursor)); } else if (Value.isLocation()) { MachineLocation Location = Value.getLoc(); DwarfExpr.setLocation(Location, DIExpr); @@ -2509,24 +2508,10 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT, DwarfExpr.addExpression(std::move(ExprCursor)); return; } else if (Value.isConstantFP()) { - if (AP.getDwarfVersion() >= 4 && AP.getDwarfDebug()->tuneForGDB()) { - DwarfExpr.addConstantFP(Value.getConstantFP()->getValueAPF()); - return; - } else if (Value.getConstantFP() - ->getValueAPF() - .bitcastToAPInt() - .getBitWidth() <= 64 /*bits*/) { - DwarfExpr.addUnsignedConstant( - Value.getConstantFP()->getValueAPF().bitcastToAPInt()); - DwarfExpr.addExpression(std::move(ExprCursor)); - return; - } - LLVM_DEBUG( - dbgs() - << "Skipped DwarfExpression creation for ConstantFP of size: " - << Value.getConstantFP()->getValueAPF().bitcastToAPInt().getBitWidth() - << " bits\n"); + APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt(); + DwarfExpr.addUnsignedConstant(RawBytes); } + DwarfExpr.addExpression(std::move(ExprCursor)); } void DebugLocEntry::finalize(const AsmPrinter &AP, diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 2587ee61b0599..34364134d92ac 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -114,7 +114,7 @@ class DbgEntity { /// /// Variables that have been optimized out use none of these fields. class DbgVariable : public DbgEntity { - /// Offset in DebugLocs. + /// Index of the entry list in DebugLocs. unsigned DebugLocListIndex = ~0u; /// DW_OP_LLVM_tag_offset value from DebugLocs. 
Optional DebugLocListTagOffset; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 7e6f4a0425d87..d4762121d1050 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -25,8 +25,6 @@ using namespace llvm; -#define DEBUG_TYPE "dwarfdebug" - void DwarfExpression::emitConstu(uint64_t Value) { if (Value < 32) emitOp(dwarf::DW_OP_lit0 + Value); @@ -221,36 +219,6 @@ void DwarfExpression::addUnsignedConstant(const APInt &Value) { } } -void DwarfExpression::addConstantFP(const APFloat &Value) { - assert(isImplicitLocation() || isUnknownLocation()); - APInt RawBytes = Value.bitcastToAPInt(); - int NumBytes = RawBytes.getBitWidth() / 8; - const char *Data = (const char *)RawBytes.getRawData(); - emitOp(dwarf::DW_OP_implicit_value); - if (NumBytes == 4 /*float*/ || NumBytes == 8 /*double*/) { - emitUnsigned(NumBytes /*Size of the block in bytes*/); - for (int i = 0; i < NumBytes; ++i) - emitData1(Data[i]); - return; - } - if (NumBytes == 10 /*long double*/) { - // long double IEEE representation uses 80 bits(10 bytes). - // 6 bytes are padded to make it 128 bits(16 bytes) due to - // addressing restrictions. - emitUnsigned(16 /*Size of the block in bytes*/); - // Emit the block of bytes. - for (int i = 0; i < NumBytes; ++i) - emitData1(Data[i]); - // Emit the rest as padding bytes. - for (int i = 0; i < 16 - NumBytes; ++i) - emitData1(0); - return; - } - LLVM_DEBUG( - dbgs() << "Skipped DW_OP_implicit_value creation for ConstantFP of size: " - << RawBytes.getBitWidth() << " bits\n"); -} - bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, DIExpressionCursor &ExprCursor, unsigned MachineReg, diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 097208f1cfade..757b175114535 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -299,9 +299,6 @@ class DwarfExpression { /// Emit an unsigned constant. void addUnsignedConstant(const APInt &Value); - /// Emit floating point constant. - void addConstantFP(const APFloat &Value); - /// Lock this down to become a memory location description. 
void setMemoryLocationKind() { assert(isUnknownLocation()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 1ba2afe48bf4b..2b45e50869edf 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -335,7 +335,7 @@ void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) { } addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr); - addLabel(Die, dwarf::DW_FORM_udata, Sym); + addLabel(Die, dwarf::DW_FORM_addr, Sym); } void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute, diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index a5030305435c1..c61531c5141a9 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1239,7 +1239,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Value *NewValueInsert = insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV); Value *StoreSuccess = - TLI->emitStoreConditional(Builder, NewValueInsert, Addr, MemOpOrder); + TLI->emitStoreConditional(Builder, NewValueInsert, PMV.AlignedAddr, + MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 42cffafbb1ce9..a85ac80ef3652 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4322,7 +4322,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned SrcAS = AddrInst->getOperand(0)->getType()->getPointerAddressSpace(); unsigned DestAS = AddrInst->getType()->getPointerAddressSpace(); - if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) + if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS)) return matchAddr(AddrInst->getOperand(0), Depth); return false; } diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 12dadf97e02c6..0ada09f469b76 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -332,7 +332,7 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { CGBINDOPT(FunctionSections); static cl::opt BBSections( - "basicblock-sections", + "basic-block-sections", cl::desc("Emit basic blocks into separate sections"), cl::value_desc("all | | labels | none"), cl::init("none")); @@ -352,7 +352,7 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { CGBINDOPT(UniqueSectionNames); static cl::opt UniqueBasicBlockSectionNames( - "unique-bb-section-names", + "unique-basic-block-section-names", cl::desc("Give unique names to every basic block section"), cl::init(false)); CGBINDOPT(UniqueBasicBlockSectionNames); diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index 6f8fd309e1b58..9048583ff7281 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/IR/DebugInfoMetadata.h" using namespace llvm; @@ -134,6 +135,21 @@ CSEMIRBuilder::generateCopiesIfRequired(ArrayRef DstOps, if (Op.getDstOpKind() == DstOp::DstType::Ty_Reg) return buildCopy(Op.getReg(), MIB.getReg(0)); } + + // If we didn't generate a copy then we're re-using an existing node directly + // instead of emitting 
any code. Merge the debug location we wanted to emit + // into the instruction we're CSE'ing with. Debug locations aren't part of the + // profile so we don't need to recompute it. + if (getDebugLoc()) { + GISelChangeObserver *Observer = getState().Observer; + if (Observer) + Observer->changingInstr(*MIB); + MIB->setDebugLoc( + DILocation::getMergedLocation(MIB->getDebugLoc(), getDebugLoc())); + if (Observer) + Observer->changedInstr(*MIB); + } + return MIB; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 422b71b28b6fe..a9ec1d4365a34 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1545,6 +1545,39 @@ bool CombinerHelper::tryCombineShiftToUnmerge(MachineInstr &MI, return false; } +bool CombinerHelper::matchCombineI2PToP2I(MachineInstr &MI, Register &Reg) { + assert(MI.getOpcode() == TargetOpcode::G_INTTOPTR && "Expected a G_INTTOPTR"); + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + Register SrcReg = MI.getOperand(1).getReg(); + return mi_match(SrcReg, MRI, + m_GPtrToInt(m_all_of(m_SpecificType(DstTy), m_Reg(Reg)))); +} + +bool CombinerHelper::applyCombineI2PToP2I(MachineInstr &MI, Register &Reg) { + assert(MI.getOpcode() == TargetOpcode::G_INTTOPTR && "Expected a G_INTTOPTR"); + Register DstReg = MI.getOperand(0).getReg(); + Builder.setInstr(MI); + Builder.buildCopy(DstReg, Reg); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::matchCombineP2IToI2P(MachineInstr &MI, Register &Reg) { + assert(MI.getOpcode() == TargetOpcode::G_PTRTOINT && "Expected a G_PTRTOINT"); + Register SrcReg = MI.getOperand(1).getReg(); + return mi_match(SrcReg, MRI, m_GIntToPtr(m_Reg(Reg))); +} + +bool CombinerHelper::applyCombineP2IToI2P(MachineInstr &MI, Register &Reg) { + assert(MI.getOpcode() == TargetOpcode::G_PTRTOINT && "Expected a G_PTRTOINT"); + Register DstReg = MI.getOperand(0).getReg(); + Builder.setInstr(MI); + Builder.buildZExtOrTrunc(DstReg, Reg); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) { return MO.isReg() && diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index aa898d5a61896..b14492ce01236 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -222,8 +222,9 @@ ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) { } int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { - if (FrameIndices.find(&AI) != FrameIndices.end()) - return FrameIndices[&AI]; + auto MapEntry = FrameIndices.find(&AI); + if (MapEntry != FrameIndices.end()) + return MapEntry->second; uint64_t ElementSize = DL->getTypeAllocSize(AI.getAllocatedType()); uint64_t Size = @@ -1214,6 +1215,16 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, return true; } +bool IRTranslator::translateFixedPointIntrinsic(unsigned Op, const CallInst &CI, + MachineIRBuilder &MIRBuilder) { + Register Dst = getOrCreateVReg(CI); + Register Src0 = getOrCreateVReg(*CI.getOperand(0)); + Register Src1 = getOrCreateVReg(*CI.getOperand(1)); + uint64_t Scale = cast<ConstantInt>(CI.getOperand(2))->getZExtValue(); + MIRBuilder.buildInstr(Op, {Dst}, { Src0, Src1, Scale }); + return true; +} + unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { switch (ID) { default: @@ -1270,6 +1281,8 @@
unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_FRINT; case Intrinsic::round: return TargetOpcode::G_INTRINSIC_ROUND; + case Intrinsic::roundeven: + return TargetOpcode::G_INTRINSIC_ROUNDEVEN; case Intrinsic::sin: return TargetOpcode::G_FSIN; case Intrinsic::sqrt: @@ -1280,6 +1293,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_READCYCLECOUNTER; case Intrinsic::ptrmask: return TargetOpcode::G_PTRMASK; + case Intrinsic::lrint: + return TargetOpcode::G_INTRINSIC_LRINT; } return Intrinsic::not_intrinsic; } @@ -1372,7 +1387,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, // Get the underlying objects for the location passed on the lifetime // marker. SmallVector Allocas; - GetUnderlyingObjects(CI.getArgOperand(1), Allocas, *DL); + getUnderlyingObjects(CI.getArgOperand(1), Allocas); // Iterate over each underlying object, creating lifetime markers for each // static alloca. Quit if we find a non-static alloca. @@ -1494,6 +1509,22 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateBinaryOp(TargetOpcode::G_SMIN, CI, MIRBuilder); case Intrinsic::smax: return translateBinaryOp(TargetOpcode::G_SMAX, CI, MIRBuilder); + case Intrinsic::smul_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIX, CI, MIRBuilder); + case Intrinsic::umul_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIX, CI, MIRBuilder); + case Intrinsic::smul_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIXSAT, CI, MIRBuilder); + case Intrinsic::umul_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIXSAT, CI, MIRBuilder); + case Intrinsic::sdiv_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIX, CI, MIRBuilder); + case Intrinsic::udiv_fix: + return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIX, CI, MIRBuilder); + case Intrinsic::sdiv_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIXSAT, CI, MIRBuilder); + case Intrinsic::udiv_fix_sat: + return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); @@ -1517,6 +1548,18 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, } return true; } + case Intrinsic::convert_from_fp16: + // FIXME: This intrinsic should probably be removed from the IR. + MIRBuilder.buildFPExt(getOrCreateVReg(CI), + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; + case Intrinsic::convert_to_fp16: + // FIXME: This intrinsic should probably be removed from the IR. + MIRBuilder.buildFPTrunc(getOrCreateVReg(CI), + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: @@ -1603,6 +1646,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, } case Intrinsic::invariant_end: return true; + case Intrinsic::expect: + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: { + // Drop the intrinsic, but forward the value. 
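The fixed-point multiply and divide intrinsics routed through translateFixedPointIntrinsic above all share one shape: two operands plus an immediate scale, computing (a * b) >> scale (or the analogous division) in fixed point. A plain-integer sketch of the smul.fix semantics, assuming the __int128 compiler extension for the wide product and exact (remainder-free) results so the rounding direction does not matter:

#include <cassert>
#include <cstdint>

static int64_t smulFix(int64_t A, int64_t B, unsigned Scale) {
  // Widen to 128 bits so the intermediate product cannot overflow.
  __int128 Prod = static_cast<__int128>(A) * B;
  return static_cast<int64_t>(Prod >> Scale);
}

int main() {
  // With Scale = 16 the operands are Q47.16 fixed point: 1.5 * 2.0 == 3.0.
  int64_t OneAndHalf = 3LL << 15; // 1.5 in Q.16
  int64_t Two = 2LL << 16;        // 2.0 in Q.16
  assert(smulFix(OneAndHalf, Two, 16) == (3LL << 16)); // 3.0 in Q.16
  return 0;
}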
+ MIRBuilder.buildCopy(getOrCreateVReg(CI), + getOrCreateVReg(*CI.getArgOperand(0))); + return true; + } case Intrinsic::assume: case Intrinsic::var_annotation: case Intrinsic::sideeffect: @@ -1732,10 +1785,6 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { MIB->copyIRFlags(CI); for (auto &Arg : enumerate(CI.arg_operands())) { - // Some intrinsics take metadata parameters. Reject them. - if (isa(Arg.value())) - return false; - // If this is required to be an immediate, don't materialize it in a // register. if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) { @@ -1748,6 +1797,11 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } else { MIB.addFPImm(cast(Arg.value())); } + } else if (auto MD = dyn_cast(Arg.value())) { + auto *MDN = dyn_cast(MD->getMetadata()); + if (!MDN) // This was probably an MDString. + return false; + MIB.addMetadata(MDN); } else { ArrayRef VRegs = getOrCreateVRegs(*Arg.value()); if (VRegs.size() > 1) diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index 502f3cb85726c..7acf9c843235e 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -658,6 +658,7 @@ bool InlineAsmLowering::lowerAsmOperandForConstraint( default: return false; case 'i': // Simple Integer or Relocatable Constant + case 'n': // immediate integer with a known value. if (ConstantInt *CI = dyn_cast(Val)) { assert(CI->getBitWidth() <= 64 && "expected immediate to fit into 64-bits"); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp index a83742f2138fc..17bce517814de 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp @@ -24,7 +24,7 @@ LegalityPredicates::typeInSet(unsigned TypeIdx, std::initializer_list TypesInit) { SmallVector Types = TypesInit; return [=](const LegalityQuery &Query) { - return std::find(Types.begin(), Types.end(), Query.Types[TypeIdx]) != Types.end(); + return llvm::is_contained(Types, Query.Types[TypeIdx]); }; } @@ -34,7 +34,7 @@ LegalityPredicate LegalityPredicates::typePairInSet( SmallVector, 4> Types = TypesInit; return [=](const LegalityQuery &Query) { std::pair Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1]}; - return std::find(Types.begin(), Types.end(), Match) != Types.end(); + return llvm::is_contained(Types, Match); }; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5dcb5b3271d8a..592f79aa7b712 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -16,6 +16,7 @@ #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -29,6 +30,7 @@ using namespace llvm; using namespace LegalizeActions; +using namespace MIPatternMatch; /// Try to break down \p OrigTy into \p NarrowTy sized pieces. 
/// @@ -75,6 +77,8 @@ static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) { return Type::getFloatTy(Ctx); case 64: return Type::getDoubleTy(Ctx); + case 80: + return Type::getX86_FP80Ty(Ctx); case 128: return Type::getFP128Ty(Ctx); default: @@ -384,7 +388,7 @@ void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy, } static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { -#define RTLIBCASE(LibcallPrefix) \ +#define RTLIBCASE_INT(LibcallPrefix) \ do { \ switch (Size) { \ case 32: \ @@ -398,19 +402,33 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { } \ } while (0) - assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size"); +#define RTLIBCASE(LibcallPrefix) \ + do { \ + switch (Size) { \ + case 32: \ + return RTLIB::LibcallPrefix##32; \ + case 64: \ + return RTLIB::LibcallPrefix##64; \ + case 80: \ + return RTLIB::LibcallPrefix##80; \ + case 128: \ + return RTLIB::LibcallPrefix##128; \ + default: \ + llvm_unreachable("unexpected size"); \ + } \ + } while (0) switch (Opcode) { case TargetOpcode::G_SDIV: - RTLIBCASE(SDIV_I); + RTLIBCASE_INT(SDIV_I); case TargetOpcode::G_UDIV: - RTLIBCASE(UDIV_I); + RTLIBCASE_INT(UDIV_I); case TargetOpcode::G_SREM: - RTLIBCASE(SREM_I); + RTLIBCASE_INT(SREM_I); case TargetOpcode::G_UREM: - RTLIBCASE(UREM_I); + RTLIBCASE_INT(UREM_I); case TargetOpcode::G_CTLZ_ZERO_UNDEF: - RTLIBCASE(CTLZ_I); + RTLIBCASE_INT(CTLZ_I); case TargetOpcode::G_FADD: RTLIBCASE(ADD_F); case TargetOpcode::G_FSUB: @@ -453,6 +471,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(RINT_F); case TargetOpcode::G_FNEARBYINT: RTLIBCASE(NEARBYINT_F); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + RTLIBCASE(ROUNDEVEN_F); } llvm_unreachable("Unknown libcall function"); } @@ -668,10 +688,11 @@ LegalizerHelper::libcall(MachineInstr &MI) { case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FSQRT: case TargetOpcode::G_FRINT: - case TargetOpcode::G_FNEARBYINT: { + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); - if (!HLTy || (Size != 32 && Size != 64 && Size != 128)) { - LLVM_DEBUG(dbgs() << "No libcall available for size " << Size << ".\n"); + if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); return UnableToLegalize; } auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); @@ -1204,6 +1225,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, MI.eraseFromParent(); return Legalized; } + case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_PTRMASK: { if (TypeIdx != 1) return UnableToLegalize; @@ -1295,10 +1317,8 @@ void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy, void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); - Register DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MIRBuilder.buildExtract(MO, DstExt, 0); - MO.setReg(DstExt); + MO.setReg(widenWithUnmerge(WideTy, MO.getReg())); } void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, @@ -1466,6 +1486,40 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) { + Register WideReg = MRI.createGenericVirtualRegister(WideTy); + LLT 
OrigTy = MRI.getType(OrigReg); + LLT LCMTy = getLCMType(WideTy, OrigTy); + + const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits(); + const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits(); + + Register UnmergeSrc = WideReg; + + // Create a merge to the LCM type, padding with undef + // %0:_(<3 x s32>) = G_FOO => <4 x s32> + // => + // %1:_(<4 x s32>) = G_FOO + // %2:_(<4 x s32>) = G_IMPLICIT_DEF + // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2 + // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3 + if (NumMergeParts > 1) { + Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0); + SmallVector MergeParts(NumMergeParts, Undef); + MergeParts[0] = WideReg; + UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0); + } + + // Unmerge to the original register and pad with dead defs. + SmallVector UnmergeResults(NumUnmergeParts); + UnmergeResults[0] = OrigReg; + for (int I = 1; I != NumUnmergeParts; ++I) + UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy); + + MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc); + return WideReg; +} + LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { @@ -1613,8 +1667,7 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { Src = MIRBuilder.buildAnyExt(WideTy, Src); ShiftTy = WideTy; - } else if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) - return UnableToLegalize; + } auto LShr = MIRBuilder.buildLShr( ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset)); @@ -2161,6 +2214,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FPOW: case TargetOpcode::G_INTRINSIC_TRUNC: case TargetOpcode::G_INTRINSIC_ROUND: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: assert(TypeIdx == 0); Observer.changingInstr(MI); @@ -2308,6 +2362,122 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) { return UnableToLegalize; } +/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this +/// is casting to a vector with a smaller element size, perform multiple element +/// extracts and merge the results. If this is coercing to a vector with larger +/// elements, index the bitcasted vector and extract the target element with bit +/// operations. This is intended to force the indexing in the native register +/// size for architectures that can dynamically index the register file. +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy) { + if (TypeIdx != 1) + return UnableToLegalize; + + Register Dst = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register Idx = MI.getOperand(2).getReg(); + LLT SrcVecTy = MRI.getType(SrcVec); + LLT IdxTy = MRI.getType(Idx); + + LLT SrcEltTy = SrcVecTy.getElementType(); + unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; + unsigned OldNumElts = SrcVecTy.getNumElements(); + + LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; + Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); + + const unsigned NewEltSize = NewEltTy.getSizeInBits(); + const unsigned OldEltSize = SrcEltTy.getSizeInBits(); + if (NewNumElts > OldNumElts) { + // Decreasing the vector element size + // + // e.g. 
i64 = extract_vector_elt x:v2i64, y:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // + // i64 = bitcast + // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), + // (i32 (extract_vector_elt castx, (2 * y + 1))) + // + if (NewNumElts % OldNumElts != 0) + return UnableToLegalize; + + // Type of the intermediate result vector. + const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts; + LLT MidTy = LLT::scalarOrVector(NewEltsPerOldElt, NewEltTy); + + auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt); + + SmallVector NewOps(NewEltsPerOldElt); + auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK); + + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I); + auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset); + auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx); + NewOps[I] = Elt.getReg(0); + } + + auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps); + MIRBuilder.buildBitcast(Dst, NewVec); + MI.eraseFromParent(); + return Legalized; + } + + if (NewNumElts < OldNumElts) { + if (NewEltSize % OldEltSize != 0) + return UnableToLegalize; + + // This only depends on powers of 2 because we use bit tricks to figure out + // the bit offset we need to shift to get the target element. A general + // expansion could emit division/multiply. + if (!isPowerOf2_32(NewEltSize / OldEltSize)) + return UnableToLegalize; + + // Increasing the vector element size. + // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(), %idx + // + // => + // + // %cast = G_BITCAST %vec + // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize) + // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx + // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) + // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) + // %elt_bits = G_LSHR %wide_elt, %offset_bits + // %elt = G_TRUNC %elt_bits + + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); + + // Divide to get the index in the wider element type. + auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); + + Register WideElt = CastVec; + if (CastTy.isVector()) { + WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, + ScaledIdx).getReg(0); + } + + // Now figure out the amount we need to shift to get the target bits. + auto OffsetMask = MIRBuilder.buildConstant( + IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio)); + auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask); + auto OffsetBits = MIRBuilder.buildShl( + IdxTy, OffsetIdx, + MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize))); + + // Shift the wide element to get the target element. + auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits); + MIRBuilder.buildTrunc(Dst, ExtractedBits); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + LegalizerHelper::LegalizeResult LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { switch (MI.getOpcode()) { @@ -2356,11 +2526,20 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + return bitcastExtractVectorElt(MI, TypeIdx, CastTy); default: return UnableToLegalize; } } +// Legalize an instruction by changing the opcode in place. 
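The increasing-element-size path of bitcastExtractVectorElt above recovers the narrow element with shift-and-mask arithmetic on the wider lane. A concrete little-endian sketch of that case, extracting i8 elements out of i32 lanes (power-of-two ratio assumed, mirroring the Log2EltRatio math):

#include <cassert>
#include <cstdint>

// Extract byte element Idx from a <8 x i8> vector stored as two i32 lanes.
static uint8_t extractByte(const uint32_t Wide[2], unsigned Idx) {
  const unsigned Log2EltRatio = 2;          // Log2(32 / 8)
  unsigned ScaledIdx = Idx >> Log2EltRatio; // which i32 lane holds the byte
  unsigned OffsetIdx = Idx & ((1u << Log2EltRatio) - 1); // byte within the lane
  unsigned OffsetBits = OffsetIdx << 3;     // shift amount; Log2(OldEltSize) == 3
  return uint8_t(Wide[ScaledIdx] >> OffsetBits);
}

int main() {
  const uint32_t Wide[2] = {0x03020100u, 0x07060504u}; // bytes 0..7, little endian
  for (unsigned I = 0; I != 8; ++I)
    assert(extractByte(Wide, I) == I);
  return 0;
}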
+void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) { + Observer.changingInstr(MI); + MI.setDesc(MIRBuilder.getTII().get(NewOpcode)); + Observer.changedInstr(MI); +} + LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; @@ -2459,6 +2638,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerFFloor(MI); case TargetOpcode::G_INTRINSIC_ROUND: return lowerIntrinsicRound(MI); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { + // Since round even is the assumed rounding mode for unconstrained FP + // operations, rint and roundeven are the same operation. + changeOpcode(MI, TargetOpcode::G_FRINT); + return Legalized; + } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { Register OldValRes = MI.getOperand(0).getReg(); Register SuccessRes = MI.getOperand(1).getReg(); @@ -2713,6 +2898,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.eraseFromParent(); return Legalized; } + case G_EXTRACT_VECTOR_ELT: + return lowerExtractVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); case G_DYN_STACKALLOC: @@ -2752,30 +2939,78 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { } } +Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty, + Align MinAlign) const { + // FIXME: We're missing a way to go back from LLT to llvm::Type to query the + // datalayout for the preferred alignment. Also there should be a target hook + // for this to allow targets to reduce the alignment and ignore the + // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of + // the type. + return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign); +} + +MachineInstrBuilder +LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment, + MachinePointerInfo &PtrInfo) { + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false); + + unsigned AddrSpace = DL.getAllocaAddrSpace(); + LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + + PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx); + return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx); +} + +static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg, + LLT VecTy) { + int64_t IdxVal; + if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) + return IdxReg; + + LLT IdxTy = B.getMRI()->getType(IdxReg); + unsigned NElts = VecTy.getNumElements(); + if (isPowerOf2_32(NElts)) { + APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts)); + return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0); + } + + return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1)) + .getReg(0); +} + +Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, + Register Index) { + LLT EltTy = VecTy.getElementType(); + + // Calculate the element offset and add it to the pointer. + unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size. 
+ assert(EltSize * 8 == EltTy.getSizeInBits() && + "Converting bits to bytes lost precision"); + + Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy); + + LLT IdxTy = MRI.getType(Index); + auto Mul = MIRBuilder.buildMul(IdxTy, Index, + MIRBuilder.buildConstant(IdxTy, EltSize)); + + LLT PtrTy = MRI.getType(VecPtr); + return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0); +} + LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - SmallVector DstRegs; - - unsigned NarrowSize = NarrowTy.getSizeInBits(); Register DstReg = MI.getOperand(0).getReg(); - unsigned Size = MRI.getType(DstReg).getSizeInBits(); - int NumParts = Size / NarrowSize; - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. - if (Size % NarrowSize != 0) - return UnableToLegalize; + LLT DstTy = MRI.getType(DstReg); + LLT LCMTy = getLCMType(DstTy, NarrowTy); - for (int i = 0; i < NumParts; ++i) { - Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy); - MIRBuilder.buildUndef(TmpReg); - DstRegs.push_back(TmpReg); - } + unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits(); - if (NarrowTy.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); + auto NewUndef = MIRBuilder.buildUndef(NarrowTy); + SmallVector Parts(NumParts, NewUndef.getReg(0)); + buildWidenedRemergeToDst(DstReg, LCMTy, Parts); MI.eraseFromParent(); return Legalized; } @@ -3469,6 +3704,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_ADD: case G_SUB: case G_MUL: + case G_PTR_ADD: case G_SMULH: case G_UMULH: case G_FADD: @@ -3492,6 +3728,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_FFLOOR: case G_FRINT: case G_INTRINSIC_ROUND: + case G_INTRINSIC_ROUNDEVEN: case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: @@ -5121,6 +5358,57 @@ LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) { return Legalized; } +/// Lower a vector extract by writing the vector to a stack temporary and +/// reloading the element. +/// +/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx +/// => +/// %stack_temp = G_FRAME_INDEX +/// G_STORE %vec, %stack_temp +/// %idx = clamp(%idx, %vec.getNumElements()) +/// %element_ptr = G_PTR_ADD %stack_temp, %idx +/// %dst = G_LOAD %element_ptr +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerExtractVectorElt(MachineInstr &MI) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register Idx = MI.getOperand(2).getReg(); + LLT VecTy = MRI.getType(SrcVec); + LLT EltTy = VecTy.getElementType(); + if (!EltTy.isByteSized()) { // Not implemented. + LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); + return UnableToLegalize; + } + + unsigned EltBytes = EltTy.getSizeInBytes(); + Align StoreAlign = getStackTemporaryAlignment(VecTy); + Align LoadAlign; + + MachinePointerInfo PtrInfo; + auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()), + StoreAlign, PtrInfo); + MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, StoreAlign); + + // Get the pointer to the element, and be sure not to hit undefined behavior + // if the index is out of bounds. 
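clampDynamicVectorIndex above keeps a variable index in range with a mask when the element count is a power of two, and with a umin against NumElts - 1 otherwise; an out-of-range G_EXTRACT_VECTOR_ELT may produce an arbitrary value, but the stack load must stay in bounds. A scalar sketch of the two clamp cases:

#include <cassert>

static unsigned clampIndex(unsigned Idx, unsigned NumElts) {
  // Power of two: masking keeps the index in [0, NumElts).
  if ((NumElts & (NumElts - 1)) == 0)
    return Idx & (NumElts - 1);
  // Otherwise clamp to the last valid element.
  return Idx < NumElts - 1 ? Idx : NumElts - 1;
}

int main() {
  assert(clampIndex(5, 4) == 1); // masked with 0b11
  assert(clampIndex(7, 3) == 2); // umin(7, 2)
  assert(clampIndex(1, 3) == 1); // in range, unchanged
  return 0;
}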
+ Register LoadPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); + + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal))) { + int64_t Offset = IdxVal * EltBytes; + PtrInfo = PtrInfo.getWithOffset(Offset); + LoadAlign = commonAlignment(StoreAlign, Offset); + } else { + // We lose information with a variable offset. + LoadAlign = getStackTemporaryAlignment(EltTy); + PtrInfo = MachinePointerInfo(MRI.getType(LoadPtr).getAddressSpace()); + } + + MIRBuilder.buildLoad(DstReg, LoadPtr, PtrInfo, LoadAlign); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 4abd0c4df97a2..d14682ccc054b 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -105,6 +105,7 @@ raw_ostream &LegalityQuery::print(raw_ostream &OS) const { static bool hasNoSimpleLoops(const LegalizeRule &Rule, const LegalityQuery &Q, const std::pair &Mutation) { switch (Rule.getAction()) { + case Legal: case Custom: case Lower: case MoreElements: @@ -122,7 +123,7 @@ static bool mutationIsSane(const LegalizeRule &Rule, std::pair Mutation) { // If the user wants a custom mutation, then we can't really say much about // it. Return true, and trust that they're doing the right thing. - if (Rule.getAction() == Custom) + if (Rule.getAction() == Custom || Rule.getAction() == Legal) return true; const unsigned TypeIdx = Mutation.first; diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 6a6137d466766..e5a1069ce5c1e 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -9,8 +9,8 @@ /// This file implements the MachineIRBuidler class. 
//===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" - #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -319,10 +319,19 @@ MachineInstrBuilder MachineIRBuilder::buildBrCond(Register Tst, return buildInstr(TargetOpcode::G_BRCOND).addUse(Tst).addMBB(&Dest); } -MachineInstrBuilder MachineIRBuilder::buildLoad(const DstOp &Res, - const SrcOp &Addr, - MachineMemOperand &MMO) { - return buildLoadInstr(TargetOpcode::G_LOAD, Res, Addr, MMO); +MachineInstrBuilder +MachineIRBuilder::buildLoad(const DstOp &Dst, const SrcOp &Addr, + MachinePointerInfo PtrInfo, Align Alignment, + MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo) { + MMOFlags |= MachineMemOperand::MOLoad; + assert((MMOFlags & MachineMemOperand::MOStore) == 0); + + uint64_t Size = MemoryLocation::getSizeOrUnknown( + TypeSize::Fixed(Dst.getLLTTy(*getMRI()).getSizeInBytes())); + MachineMemOperand *MMO = + getMF().getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo); + return buildLoad(Dst, Addr, *MMO); } MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode, @@ -369,6 +378,21 @@ MachineInstrBuilder MachineIRBuilder::buildStore(const SrcOp &Val, return MIB; } +MachineInstrBuilder +MachineIRBuilder::buildStore(const SrcOp &Val, const SrcOp &Addr, + MachinePointerInfo PtrInfo, Align Alignment, + MachineMemOperand::Flags MMOFlags, + const AAMDNodes &AAInfo) { + MMOFlags |= MachineMemOperand::MOStore; + assert((MMOFlags & MachineMemOperand::MOLoad) == 0); + + uint64_t Size = MemoryLocation::getSizeOrUnknown( + TypeSize::Fixed(Val.getLLTTy(*getMRI()).getSizeInBytes())); + MachineMemOperand *MMO = + getMF().getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo); + return buildStore(Val, Addr, *MMO); +} + MachineInstrBuilder MachineIRBuilder::buildAnyExt(const DstOp &Res, const SrcOp &Op) { return buildInstr(TargetOpcode::G_ANYEXT, Res, Op); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 584a691efb238..4a7513f23c6b9 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -604,3 +604,24 @@ LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) { unsigned GCD = greatestCommonDivisor(OrigSize, TargetSize); return LLT::scalar(GCD); } + +Optional llvm::getSplatIndex(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + "Only G_SHUFFLE_VECTOR can have a splat index!"); + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; }); + + // If all elements are undefined, this shuffle can be considered a splat. + // Return 0 for better potential for callers to simplify. + if (FirstDefinedIdx == Mask.end()) + return 0; + + // Make sure all remaining elements are either undef or the same + // as the first non-undef value. 
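// Illustrative masks (hypothetical): <-1, -1, -1> returns 0 (all-undef is
// treated as a splat of lane 0), <2, -1, 2> returns 2, and <1, 2, 1> returns
// None because the defined lanes 0 and 1 disagree.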
+ int SplatValue = *FirstDefinedIdx; + if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()), + [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; })) + return None; + + return SplatValue; +} diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 6c5ef0255a082..204fb556d8105 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -220,6 +220,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (StackProtectorFI == (int)i) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; switch (MFI.getObjectSSPLayout(i)) { case MachineFrameInfo::SSPLK_None: @@ -254,6 +256,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (ProtectedObjs.count(i)) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign); } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index fa23df6288e99..dde0dc456c05f 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -608,58 +608,10 @@ bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const { void MIPrinter::print(const MachineBasicBlock &MBB) { assert(MBB.getNumber() >= 0 && "Invalid MBB number"); - OS << "bb." << MBB.getNumber(); - bool HasAttributes = false; - if (const auto *BB = MBB.getBasicBlock()) { - if (BB->hasName()) { - OS << "." << BB->getName(); - } else { - HasAttributes = true; - OS << " ("; - int Slot = MST.getLocalSlot(BB); - if (Slot == -1) - OS << ""; - else - OS << (Twine("%ir-block.") + Twine(Slot)).str(); - } - } - if (MBB.hasAddressTaken()) { - OS << (HasAttributes ? ", " : " ("); - OS << "address-taken"; - HasAttributes = true; - } - if (MBB.isEHPad()) { - OS << (HasAttributes ? ", " : " ("); - OS << "landing-pad"; - HasAttributes = true; - } - if (MBB.isEHFuncletEntry()) { - OS << (HasAttributes ? ", " : " ("); - OS << "ehfunclet-entry"; - HasAttributes = true; - } - if (MBB.getAlignment() != Align(1)) { - OS << (HasAttributes ? ", " : " ("); - OS << "align " << MBB.getAlignment().value(); - HasAttributes = true; - } - if (MBB.getSectionID() != MBBSectionID(0)) { - OS << (HasAttributes ? ", " : " ("); - OS << "bbsections "; - switch (MBB.getSectionID().Type) { - case MBBSectionID::SectionType::Exception: - OS << "Exception"; - break; - case MBBSectionID::SectionType::Cold: - OS << "Cold"; - break; - default: - OS << MBB.getSectionID().Number; - } - HasAttributes = true; - } - if (HasAttributes) - OS << ")"; + MBB.printName(OS, + MachineBasicBlock::PrintNameIr | + MachineBasicBlock::PrintNameAttributes, + &MST); OS << ":\n"; bool HasLineAttributes = false; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 2d4b60435d962..626c04074a617 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -338,39 +338,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (Indexes && PrintSlotIndexes) OS << Indexes->getMBBStartIdx(this) << '\t'; - OS << "bb." << getNumber(); - bool HasAttributes = false; - if (const auto *BB = getBasicBlock()) { - if (BB->hasName()) { - OS << "." 
<< BB->getName(); - } else { - HasAttributes = true; - OS << " ("; - int Slot = MST.getLocalSlot(BB); - if (Slot == -1) - OS << ""; - else - OS << (Twine("%ir-block.") + Twine(Slot)).str(); - } - } - - if (hasAddressTaken()) { - OS << (HasAttributes ? ", " : " ("); - OS << "address-taken"; - HasAttributes = true; - } - if (isEHPad()) { - OS << (HasAttributes ? ", " : " ("); - OS << "landing-pad"; - HasAttributes = true; - } - if (getAlignment() != Align(1)) { - OS << (HasAttributes ? ", " : " ("); - OS << "align " << Log2(getAlignment()); - HasAttributes = true; - } - if (HasAttributes) - OS << ")"; + printName(OS, PrintNameIr | PrintNameAttributes, &MST); OS << ":\n"; const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -478,9 +446,99 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, } } +/// Print the basic block's name as: +/// +/// bb.{number}[.{ir-name}] [(attributes...)] +/// +/// The {ir-name} is only printed when the \ref PrintNameIr flag is passed +/// (which is the default). If the IR block has no name, it is identified +/// numerically using the attribute syntax as "(%ir-block.{ir-slot})". +/// +/// When the \ref PrintNameAttributes flag is passed, additional attributes +/// of the block are printed when set. +/// +/// \param printNameFlags Combination of \ref PrintNameFlag flags indicating +/// the parts to print. +/// \param moduleSlotTracker Optional ModuleSlotTracker. This method will +/// incorporate its own tracker when necessary to +/// determine the block's IR name. +void MachineBasicBlock::printName(raw_ostream &os, unsigned printNameFlags, + ModuleSlotTracker *moduleSlotTracker) const { + os << "bb." << getNumber(); + bool hasAttributes = false; + + if (printNameFlags & PrintNameIr) { + if (const auto *bb = getBasicBlock()) { + if (bb->hasName()) { + os << '.' << bb->getName(); + } else { + hasAttributes = true; + os << " ("; + + int slot = -1; + + if (moduleSlotTracker) { + slot = moduleSlotTracker->getLocalSlot(bb); + } else if (bb->getParent()) { + ModuleSlotTracker tmpTracker(bb->getModule(), false); + tmpTracker.incorporateFunction(*bb->getParent()); + slot = tmpTracker.getLocalSlot(bb); + } + + if (slot == -1) + os << ""; + else + os << (Twine("%ir-block.") + Twine(slot)).str(); + } + } + } + + if (printNameFlags & PrintNameAttributes) { + if (hasAddressTaken()) { + os << (hasAttributes ? ", " : " ("); + os << "address-taken"; + hasAttributes = true; + } + if (isEHPad()) { + os << (hasAttributes ? ", " : " ("); + os << "landing-pad"; + hasAttributes = true; + } + if (isEHFuncletEntry()) { + os << (hasAttributes ? ", " : " ("); + os << "ehfunclet-entry"; + hasAttributes = true; + } + if (getAlignment() != Align(1)) { + os << (hasAttributes ? ", " : " ("); + os << "align " << getAlignment().value(); + hasAttributes = true; + } + if (getSectionID() != MBBSectionID(0)) { + os << (hasAttributes ? ", " : " ("); + os << "bbsections "; + switch (getSectionID().Type) { + case MBBSectionID::SectionType::Exception: + os << "Exception"; + break; + case MBBSectionID::SectionType::Cold: + os << "Cold"; + break; + default: + os << getSectionID().Number; + } + hasAttributes = true; + } + } + + if (hasAttributes) + os << ')'; +} + void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const { - OS << "%bb." 
<< getNumber(); + OS << '%'; + printName(OS, 0); } void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index c836997060149..54e0a14e05557 100644 --- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -247,10 +247,15 @@ bool MachineBlockFrequencyInfo::isIrrLoopHeader( return MBFI->isIrrLoopHeader(MBB); } -void MachineBlockFrequencyInfo::setBlockFreq(const MachineBasicBlock *MBB, - uint64_t Freq) { +void MachineBlockFrequencyInfo::onEdgeSplit( + const MachineBasicBlock &NewPredecessor, + const MachineBasicBlock &NewSuccessor, + const MachineBranchProbabilityInfo &MBPI) { assert(MBFI && "Expected analysis to be available"); - MBFI->setBlockFreq(MBB, Freq); + auto NewSuccFreq = MBFI->getBlockFreq(&NewPredecessor) * + MBPI.getEdgeProbability(&NewPredecessor, &NewSuccessor); + + MBFI->setBlockFreq(&NewSuccessor, NewSuccFreq.getFrequency()); } const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 70d6dcc2e3e29..67d15129b9045 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -288,6 +288,8 @@ class MachineCopyPropagation : public MachineFunctionPass { const MachineInstr &UseI, unsigned UseIdx); bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use); + bool hasOverlappingMultipleDef(const MachineInstr &MI, + const MachineOperand &MODef, Register Def); /// Candidates for deletion. SmallSetVector MaybeDeadCopies; @@ -461,6 +463,21 @@ bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI, return false; } +/// For an MI that has multiple definitions, check whether \p MI has +/// a definition that overlaps with another of its definitions. +/// For example, on ARM: umull r9, r9, lr, r0 +/// The umull instruction is unpredictable unless RdHi and RdLo are different. +bool MachineCopyPropagation::hasOverlappingMultipleDef( + const MachineInstr &MI, const MachineOperand &MODef, Register Def) { + for (const MachineOperand &MIDef : MI.defs()) { + if ((&MIDef != &MODef) && MIDef.isReg() && + TRI->regsOverlap(Def, MIDef.getReg())) + return true; + } + + return false; +} + /// Look for available copies whose destination register is used by \p MI and /// replace the use in \p MI with the copy's source register. 
void MachineCopyPropagation::forwardUses(MachineInstr &MI) { @@ -786,6 +803,9 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { if (hasImplicitOverlap(MI, MODef)) continue; + if (hasOverlappingMultipleDef(MI, MODef, Def)) + continue; + LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n with " << printReg(Def, TRI) << "\n in " << MI << " from " << *Copy); diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp index f866c7ca53c68..be08f0ae31171 100644 --- a/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -178,7 +178,8 @@ void MachineModuleInfo::finalize() { MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) : TM(std::move(MMI.TM)), Context(MMI.TM.getMCAsmInfo(), MMI.TM.getMCRegisterInfo(), - MMI.TM.getObjFileLowering(), nullptr, nullptr, false) { + MMI.TM.getObjFileLowering(), nullptr, nullptr, false), + MachineFunctions(std::move(MMI.MachineFunctions)) { ObjFileMMI = MMI.ObjFileMMI; CurCallSite = MMI.CurCallSite; UsesMSVCFloatingPoint = MMI.UsesMSVCFloatingPoint; diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index ef4b02ca9e3ef..45a5ef71d0fda 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -705,14 +705,13 @@ static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) { /// This function calls the code in ValueTracking, but first checks that the /// instruction has a memory operand. static void getUnderlyingObjects(const MachineInstr *MI, - SmallVectorImpl &Objs, - const DataLayout &DL) { + SmallVectorImpl &Objs) { if (!MI->hasOneMemOperand()) return; MachineMemOperand *MM = *MI->memoperands_begin(); if (!MM->getValue()) return; - GetUnderlyingObjects(MM->getValue(), Objs, DL); + getUnderlyingObjects(MM->getValue(), Objs); for (const Value *V : Objs) { if (!isIdentifiedObject(V)) { Objs.clear(); @@ -736,7 +735,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { PendingLoads.clear(); else if (MI.mayLoad()) { SmallVector Objs; - getUnderlyingObjects(&MI, Objs, MF.getDataLayout()); + ::getUnderlyingObjects(&MI, Objs); if (Objs.empty()) Objs.push_back(UnknownValue); for (auto V : Objs) { @@ -745,7 +744,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { } } else if (MI.mayStore()) { SmallVector Objs; - getUnderlyingObjects(&MI, Objs, MF.getDataLayout()); + ::getUnderlyingObjects(&MI, Objs); if (Objs.empty()) Objs.push_back(UnknownValue); for (auto V : Objs) { diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index d9d0a783f8a22..7daaa3526aa3a 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1298,7 +1298,7 @@ void ScheduleDAGMILive::computeDFSResult() { /// The cyclic path estimation identifies a def-use pair that crosses the back /// edge and considers the depth and height of the nodes. 
For example, consider /// the following instruction sequence where each instruction has unit latency -/// and defines an epomymous virtual register: +/// and defines an eponymous virtual register: /// /// a->b(a,c)->c(b)->d(c)->exit /// @@ -1653,7 +1653,13 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { unsigned ChainPredID = DAG->SUnits.size(); for (const SDep &Pred : SU.Preds) { - if (Pred.isCtrl() && !Pred.isArtificial()) { + // We only want to cluster the mem ops that have the same ctrl(non-data) + // pred so that they don't have a ctrl dependency on each other. But for + // store instrs, we can still cluster them if the pred is a load instr. + if ((Pred.isCtrl() && + (IsLoad || + (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) && + !Pred.isArtificial()) { ChainPredID = Pred.getSUnit()->NodeNum; break; } } diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 5f958bbc31b7a..dfb88f6bf5170 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -347,11 +347,9 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { << printMBBReference(*Pair.first) << " -- " << printMBBReference(*NewSucc) << " -- " << printMBBReference(*Pair.second) << '\n'); - if (MBFI) { - auto NewSuccFreq = MBFI->getBlockFreq(Pair.first) * - MBPI->getEdgeProbability(Pair.first, NewSucc); - MBFI->setBlockFreq(NewSucc, NewSuccFreq.getFrequency()); - } + if (MBFI) + MBFI->onEdgeSplit(*Pair.first, *NewSucc, *MBPI); + + MadeChange = true; ++NumSplit; } else diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c1a2c4e0bc6e6..238df43da9f5c 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -132,7 +132,8 @@ namespace { bool reachable = false; // Vregs that must be live in because they are used without being - // defined. Map value is the user. + // defined. Map value is the user. vregsLiveIn doesn't include regs + // that are only used by PHI nodes. RegMap vregsLiveIn; // Regs killed in MBB. They may be defined again, and will then be in both @@ -2229,63 +2230,28 @@ class FilteringVRegSet { // can pass through an MBB live, but may not be live every time. It is assumed // that all vregsPassed sets are empty before the call. void MachineVerifier::calcRegsPassed() { - // This is a forward dataflow, doing it in RPO. A standard map serves as a - // priority (sorting by RPO number) queue, deduplicating worklist, and an RPO - // number to MBB mapping all at once. - std::map RPOWorklist; - DenseMap RPONumbers; - if (MF->empty()) { + if (MF->empty()) // ReversePostOrderTraversal doesn't handle empty functions. return; - } - std::vector VRegsPassedSets(MF->size()); - for (const MachineBasicBlock *MBB : - ReversePostOrderTraversal(MF)) { - // Careful with the evaluation order, fetch next number before allocating. - unsigned Number = RPONumbers.size(); - RPONumbers[MBB] = Number; - // Set-up the transfer functions for all blocks. - const BBInfo &MInfo = MBBInfoMap[MBB]; - VRegsPassedSets[Number].addToFilter(MInfo.regsKilled); - VRegsPassedSets[Number].addToFilter(MInfo.regsLiveOut); - } - // First push live-out regs to successors' vregsPassed. Remember the MBBs that - // have any vregsPassed.
- for (const MachineBasicBlock &MBB : *MF) { - const BBInfo &MInfo = MBBInfoMap[&MBB]; - if (!MInfo.reachable) - continue; - for (const MachineBasicBlock *Succ : MBB.successors()) { - unsigned SuccNumber = RPONumbers[Succ]; - FilteringVRegSet &SuccSet = VRegsPassedSets[SuccNumber]; - if (SuccSet.add(MInfo.regsLiveOut)) - RPOWorklist.emplace(SuccNumber, Succ); - } - } - // Iteratively push vregsPassed to successors. - while (!RPOWorklist.empty()) { - auto Next = RPOWorklist.begin(); - const MachineBasicBlock *MBB = Next->second; - RPOWorklist.erase(Next); - FilteringVRegSet &MSet = VRegsPassedSets[RPONumbers[MBB]]; - for (const MachineBasicBlock *Succ : MBB->successors()) { - if (Succ == MBB) + for (const MachineBasicBlock *MB : + ReversePostOrderTraversal(MF)) { + FilteringVRegSet VRegs; + BBInfo &Info = MBBInfoMap[MB]; + assert(Info.reachable); + + VRegs.addToFilter(Info.regsKilled); + VRegs.addToFilter(Info.regsLiveOut); + for (const MachineBasicBlock *Pred : MB->predecessors()) { + const BBInfo &PredInfo = MBBInfoMap[Pred]; + if (!PredInfo.reachable) continue; - unsigned SuccNumber = RPONumbers[Succ]; - FilteringVRegSet &SuccSet = VRegsPassedSets[SuccNumber]; - if (SuccSet.add(MSet)) - RPOWorklist.emplace(SuccNumber, Succ); + + VRegs.add(PredInfo.regsLiveOut); + VRegs.add(PredInfo.vregsPassed); } - } - // Copy the results back to BBInfos. - for (const MachineBasicBlock &MBB : *MF) { - BBInfo &MInfo = MBBInfoMap[&MBB]; - if (!MInfo.reachable) - continue; - const FilteringVRegSet &MSet = VRegsPassedSets[RPONumbers[&MBB]]; - MInfo.vregsPassed.reserve(MSet.size()); - MInfo.vregsPassed.insert(MSet.begin(), MSet.end()); + Info.vregsPassed.reserve(VRegs.size()); + Info.vregsPassed.insert(VRegs.begin(), VRegs.end()); } } @@ -2302,6 +2268,23 @@ void MachineVerifier::calcRegsRequired() { if (PInfo.addRequired(MInfo.vregsLiveIn)) todo.insert(Pred); } + + // Handle the PHI node. + for (const MachineInstr &MI : MBB.phis()) { + for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { + // Skip those Operands which are undef regs or not regs. + if (!MI.getOperand(i).isReg() || !MI.getOperand(i).readsReg()) + continue; + + // Get register and predecessor for one PHI edge. + Register Reg = MI.getOperand(i).getReg(); + const MachineBasicBlock *Pred = MI.getOperand(i + 1).getMBB(); + + BBInfo &PInfo = MBBInfoMap[Pred]; + if (PInfo.addRequired(Reg)) + todo.insert(Pred); + } + } } // Iteratively push vregsRequired to predecessors. This will converge to the diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 311b87fa9e3b0..521ae367cde33 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -324,21 +324,43 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, // Increment use count of the newly created virtual register. LV->setPHIJoin(IncomingReg); - // When we are reusing the incoming register, it may already have been - // killed in this block. The old kill will also have been inserted at - // AfterPHIsIt, so it appears before the current PHICopy. - if (reusedIncoming) - if (MachineInstr *OldKill = VI.findKill(&MBB)) { - LLVM_DEBUG(dbgs() << "Remove old kill from " << *OldKill); - LV->removeVirtualRegisterKilled(IncomingReg, *OldKill); - LLVM_DEBUG(MBB.dump()); + MachineInstr *OldKill = nullptr; + bool IsPHICopyAfterOldKill = false; + + if (reusedIncoming && (OldKill = VI.findKill(&MBB))) { + // Calculate whether the PHICopy is after the OldKill. 
+ // In general, the PHICopy is inserted as the first non-phi instruction + // by default, so it's before the OldKill. But some Target hooks for + // createPHIDestinationCopy() may modify the default insert position of + // PHICopy. + for (auto I = MBB.SkipPHIsAndLabels(MBB.begin()), E = MBB.end(); + I != E; ++I) { + if (I == PHICopy) + break; + + if (I == OldKill) { + IsPHICopyAfterOldKill = true; + break; + } } + } + + // When we are reusing the incoming register and it has been marked killed + // by OldKill, if the PHICopy is after the OldKill, we should remove the + // killed flag from OldKill. + if (IsPHICopyAfterOldKill) { + LLVM_DEBUG(dbgs() << "Remove old kill from " << *OldKill); + LV->removeVirtualRegisterKilled(IncomingReg, *OldKill); + LLVM_DEBUG(MBB.dump()); + } - // Add information to LiveVariables to know that the incoming value is - // killed. Note that because the value is defined in several places (once - // each for each incoming block), the "def" block and instruction fields - // for the VarInfo is not filled in. - LV->addVirtualRegisterKilled(IncomingReg, *PHICopy); + // Add information to LiveVariables to know that the first used incoming + // value or the reused incoming value whose PHICopy is after the OldKill + // is killed. Note that because the value is defined in several places + // (once each for each incoming block), the "def" block and instruction + // fields for the VarInfo are not filled in. + if (!OldKill || IsPHICopyAfterOldKill) + LV->addVirtualRegisterKilled(IncomingReg, *PHICopy); } // Since we are going to be deleting the PHI node, if it is the last use of diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 10da2d4217977..c3608f04c6480 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -154,7 +154,7 @@ static bool getUnderlyingObjectsForInstr(const MachineInstr *MI, Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias)); } else if (const Value *V = MMO->getValue()) { SmallVector Objs; - if (!getUnderlyingObjectsForCodeGen(V, Objs, DL)) + if (!getUnderlyingObjectsForCodeGen(V, Objs)) return false; for (Value *V : Objs) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a026d39600264..b22d978d37366 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1805,6 +1805,10 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { if (OptLevel == CodeGenOpt::None) return SDValue(); + // Don't simplify the token factor if the node itself has too many operands. + if (N->getNumOperands() > TokenFactorInlineLimit) + return SDValue(); + // If the sole user is a token factor, we should make sure we have a // chance to merge them together. This prevents TF chains from inhibiting // optimizations. @@ -11125,6 +11129,22 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked!
} + // fold (sext_inreg (masked_load x)) -> (sext_masked_load x) + // ignore it if the masked load is already sign extended + if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) { + if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() && + Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { + SDValue ExtMaskedLoad = DAG.getMaskedLoad( + VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), + Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), + Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad()); + CombineTo(N, ExtMaskedLoad); + CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } + } + // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), @@ -19190,7 +19210,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // The binop must be a vector type, so we can extract some fraction of it. EVT WideBVT = BinOp.getValueType(); - if (!WideBVT.isVector()) + // The optimisations below currently assume we are dealing with fixed length + // vectors. It is possible to add support for scalable vectors, but at the + // moment we've done no analysis to prove whether they are profitable or not. + if (!WideBVT.isFixedLengthVector()) return SDValue(); EVT VT = Extract->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 5cf83cff3a903..fd821ed5c9692 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -360,7 +360,7 @@ void FunctionLoweringInfo::clear() { RegFixups.clear(); RegsWithFixups.clear(); StatepointStackSlots.clear(); - StatepointSpillMaps.clear(); + StatepointRelocationMaps.clear(); PreferredExtendType.clear(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 0e4e99214aa24..ff84fdd62075c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -82,6 +82,28 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses, return N; } +/// Return starting index of GC operand list. // FIXME: need a better place for this. Put it in StackMaps? +static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::STATEPOINT && + "STATEPOINT node expected"); + unsigned OperIdx = StatepointOpers(MI).getNumDeoptArgsIdx(); + unsigned NumDeopts = MI->getOperand(OperIdx).getImm(); + // At this point stack references have not been lowered yet, so they + // take a single operand. + ++OperIdx; + while (NumDeopts--) { + MachineOperand &MO = MI->getOperand(OperIdx); + if (MO.isImm() && MO.getImm() == StackMaps::ConstantOp) { + ++OperIdx; + assert(MI->getOperand(OperIdx).isImm() && + "Unexpected statepoint operand"); + } + ++OperIdx; + } + return OperIdx; +} + /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an /// implicit physical register output. void InstrEmitter:: @@ -200,6 +222,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() && II.isVariadic() && II.variadicOpsAreDefs(); unsigned NumVRegs = HasVRegVariadicDefs ?
NumResults : II.getNumDefs(); + if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT) + NumVRegs = NumResults; for (unsigned i = 0; i < NumVRegs; ++i) { // If the specific node value is only used by a CopyToReg and the dest reg // is a vreg in the same register class, use the CopyToReg'd destination @@ -821,6 +845,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, NumDefs = NumResults; } ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC); + } else if (Opc == TargetOpcode::STATEPOINT) { + NumDefs = NumResults; } unsigned NumImpUses = 0; @@ -970,6 +996,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef()) MIB->setPhysRegsDeadExcept(UsedRegs, *TRI); + // STATEPOINT is too 'dynamic' to have meaningful machine description. + // We have to manually tie operands. + if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) { + assert(!HasPhysRegOuts && "STATEPOINT mishandled"); + MachineInstr *MI = MIB; + unsigned Def = 0; + unsigned Use = getStatepointGCArgStartIdx(MI) + 1; + while (Def < NumDefs) { + if (MI->getOperand(Use).isReg()) + MI->tieOperands(Def++, Use); + Use += 2; + } + } + // Run post-isel target hook to adjust this instruction if needed. if (II.hasPostISelHook()) TLI->AdjustInstrPostInstrSelection(*MIB, Node); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ce185be162bcd..87e9a9e280d5c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3398,7 +3398,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { SmallVector Halves; EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext()); assert(TLI.isTypeLegal(HalfType)); - if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, Node, LHS, RHS, Halves, + if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, dl, LHS, RHS, Halves, HalfType, DAG, TargetLowering::MulExpansionKind::Always)) { for (unsigned i = 0; i < 2; ++i) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 0fa6d653a8364..9e2ae6b8f16f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -806,7 +806,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi); - void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi); @@ -862,7 +862,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); - SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); + SDValue WidenVecRes_ScalarOp(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); SDValue WidenVSELECTAndMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1394f084c6dc6..94c91854466eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ 
-836,7 +836,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; - case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break; + case ISD::SPLAT_VECTOR: + case ISD::SCALAR_TO_VECTOR: + SplitVecRes_ScalarOp(N, Lo, Hi); + break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::LOAD: SplitVecRes_LOAD(cast(N), Lo, Hi); @@ -1517,13 +1520,18 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi); } -void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, - SDValue &Hi) { +void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, + SDValue &Hi) { EVT LoVT, HiVT; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0)); - Hi = DAG.getUNDEF(HiVT); + Lo = DAG.getNode(N->getOpcode(), dl, LoVT, N->getOperand(0)); + if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) { + Hi = DAG.getUNDEF(HiVT); + } else { + assert(N->getOpcode() == ISD::SPLAT_VECTOR && "Unexpected opcode"); + Hi = Lo; + } } void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, @@ -2194,16 +2202,27 @@ SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) { SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { // We know that the extracted result type is legal. EVT SubVT = N->getValueType(0); + + if (SubVT.isScalableVector() != + N->getOperand(0).getValueType().isScalableVector()) + report_fatal_error("Extracting fixed from scalable not implemented"); + SDValue Idx = N->getOperand(1); SDLoc dl(N); SDValue Lo, Hi; + + if (SubVT.isScalableVector() != + N->getOperand(0).getValueType().isScalableVector()) + report_fatal_error("Extracting a fixed-length vector from an illegal " + "scalable vector is not yet supported"); + GetSplitVector(N->getOperand(0), Lo, Hi); - uint64_t LoElts = Lo.getValueType().getVectorNumElements(); + uint64_t LoElts = Lo.getValueType().getVectorMinNumElements(); uint64_t IdxVal = cast(Idx)->getZExtValue(); if (IdxVal < LoElts) { - assert(IdxVal + SubVT.getVectorNumElements() <= LoElts && + assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts && "Extracted subvector crosses vector split!"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx); } else { @@ -2733,7 +2752,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; - case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break; + case ISD::SPLAT_VECTOR: + case ISD::SCALAR_TO_VECTOR: + Res = WidenVecRes_ScalarOp(N); + break; case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: case ISD::SELECT: Res = WidenVecRes_SELECT(N); break; @@ -3604,16 +3626,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - unsigned WidenNumElts = WidenVT.getVectorNumElements(); - unsigned NumInElts = InVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); bool 
InputWidened = false; // Indicates we need to widen the input. if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { - if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { + unsigned WidenNumElts = WidenVT.getVectorMinNumElements(); + unsigned NumInElts = InVT.getVectorMinNumElements(); + if (WidenNumElts % NumInElts == 0) { // Add undef vectors to widen to correct length. - unsigned NumConcat = WidenVT.getVectorNumElements() / - InVT.getVectorNumElements(); + unsigned NumConcat = WidenNumElts / NumInElts; SDValue UndefVal = DAG.getUNDEF(InVT); SmallVector Ops(NumConcat); for (unsigned i=0; i < NumOperands; ++i) @@ -3637,6 +3658,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { return GetWidenedVector(N->getOperand(0)); if (NumOperands == 2) { + assert(!WidenVT.isScalableVector() && + "Cannot use vector shuffles to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Replace concat of two operands with a shuffle. SmallVector MaskOps(WidenNumElts, -1); for (unsigned i = 0; i < NumInElts; ++i) { @@ -3651,6 +3677,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { } } + assert(!WidenVT.isScalableVector() && + "Cannot use build vectors to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Fall back to use extracts and build vector. EVT EltVT = WidenVT.getVectorElementType(); SmallVector Ops(WidenNumElts); @@ -3815,10 +3846,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_ScalarOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), - WidenVT, N->getOperand(0)); + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0)); } // Return true is this is a SETCC node or a strict version of it. 
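A concrete shape for the two SplitVecRes_ScalarOp cases above, using hypothetical types (a v4i32 result split into two v2i32 halves):

  scalar_to_vector x : v4i32  ->  Lo = scalar_to_vector x : v2i32,  Hi = undef
  splat_vector x : v4i32      ->  Lo = splat_vector x : v2i32,      Hi = Lo

Only the high half differs: scalar_to_vector defines just lane 0, so everything past the split point was already undef, while a splat defines every lane, so the high half is simply the same splat again. WidenVecRes_ScalarOp needs no case split at all; either opcode can safely be re-emitted at the wider type.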
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 17c68f2bf73b3..76fb0340a7a88 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -125,8 +125,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, PhysReg = Reg; } else if (Def->isMachineOpcode()) { const MCInstrDesc &II = TII->get(Def->getMachineOpcode()); - if (ResNo >= II.getNumDefs() && - II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) + if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg)) PhysReg = Reg; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 592c09c10fb08..7fdf8a82bae85 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2323,6 +2323,10 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, } break; } + case ISD::TRUNCATE: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return isSplatValue(V.getOperand(0), DemandedElts, UndefElts); } // We don't support other cases than those above for scalable vectors at @@ -6390,7 +6394,7 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, unsigned AS) { // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all // pointer operands can be losslessly bitcasted to pointers of address space 0 - if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) { + if (AS != 0 && !TLI->getTargetMachine().isNoopAddrSpaceCast(AS, 0)) { report_fatal_error("cannot lower memory intrinsic in address space " + Twine(AS)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9d2f64b94df16..9c1517ea74140 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3005,20 +3005,6 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } -void SelectionDAGBuilder::visitFSub(const User &I) { - // -0.0 - X --> fneg - Type *Ty = I.getType(); - if (isa(I.getOperand(0)) && - I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) { - SDValue Op2 = getValue(I.getOperand(1)); - setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(), - Op2.getValueType(), Op2)); - return; - } - - visitBinary(I, ISD::FSUB); -} - void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; @@ -3425,7 +3411,7 @@ void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) { unsigned SrcAS = SV->getType()->getPointerAddressSpace(); unsigned DestAS = I.getType()->getPointerAddressSpace(); - if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS)) + if (!TM.isNoopAddrSpaceCast(SrcAS, DestAS)) N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS); setValue(&I, N); @@ -3753,8 +3739,6 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue N = getValue(Op0); SDLoc dl = getCurSDLoc(); auto &TLI = DAG.getTargetLoweringInfo(); - MVT PtrTy = TLI.getPointerTy(DAG.getDataLayout(), AS); - MVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout(), AS); // Normalize Vector GEP - all scalar operands should be converted to the // splat vector. 
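A one-line illustration of the isSplatValue extension above (hypothetical shapes): trunc(splat_vector x : v4i32) : v4i16 is still a splat, namely of trunc(x), and the same holds lane-for-lane under sign_extend and zero_extend, so the query simply recurses into the operand with the same demanded elements and undef mask.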
@@ -3880,6 +3864,13 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { } } + MVT PtrTy = TLI.getPointerTy(DAG.getDataLayout(), AS); + MVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout(), AS); + if (IsVectorGEP) { + PtrTy = MVT::getVectorVT(PtrTy, VectorElementCount); + PtrMemTy = MVT::getVectorVT(PtrMemTy, VectorElementCount); + } + if (PtrMemTy != PtrTy && !cast(I).isInBounds()) N = DAG.getPtrExtendInReg(N, dl, PtrMemTy); @@ -6631,7 +6622,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, cast(I.getArgOperand(0))->getSExtValue(); Value *const ObjectPtr = I.getArgOperand(1); SmallVector Allocas; - GetUnderlyingObjects(ObjectPtr, Allocas, *DL); + getUnderlyingObjects(ObjectPtr, Allocas); for (SmallVectorImpl::iterator Object = Allocas.begin(), E = Allocas.end(); Object != E; ++Object) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index f0b7fb0d52299..7bad055198140 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -692,7 +692,7 @@ class SelectionDAGBuilder { void visitAdd(const User &I) { visitBinary(I, ISD::ADD); } void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); } void visitSub(const User &I) { visitBinary(I, ISD::SUB); } - void visitFSub(const User &I); + void visitFSub(const User &I) { visitBinary(I, ISD::FSUB); } void visitMul(const User &I) { visitBinary(I, ISD::MUL); } void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); } void visitURem(const User &I) { visitBinary(I, ISD::UREM); } diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 2cb57c1d1ccc8..f5d29c11790c3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -67,6 +67,12 @@ cl::opt UseRegistersForDeoptValues( "use-registers-for-deopt-values", cl::Hidden, cl::init(false), cl::desc("Allow using registers for non pointer deopt args")); +cl::opt MaxRegistersForGCPointers( + "max-registers-for-gc-values", cl::Hidden, cl::init(0), + cl::desc("Max number of VRegs allowed to pass GC pointer meta args in")); + +typedef FunctionLoweringInfo::StatepointRelocationRecord RecordType; + static void pushStackMapConstant(SmallVectorImpl& Ops, SelectionDAGBuilder &Builder, uint64_t Value) { SDLoc L = Builder.getCurSDLoc(); @@ -156,14 +162,18 @@ static Optional findPreviousSpillSlot(const Value *Val, // Spill location is known for gc relocates if (const auto *Relocate = dyn_cast(Val)) { - const auto &SpillMap = - Builder.FuncInfo.StatepointSpillMaps[Relocate->getStatepoint()]; + const auto &RelocationMap = + Builder.FuncInfo.StatepointRelocationMaps[Relocate->getStatepoint()]; + + auto It = RelocationMap.find(Relocate->getDerivedPtr()); + if (It == RelocationMap.end()) + return None; - auto It = SpillMap.find(Relocate->getDerivedPtr()); - if (It == SpillMap.end()) + auto &Record = It->second; + if (Record.type != RecordType::Spill) return None; - return It->second; + return Record.payload.FI; } // Look through bitcast instructions. @@ -221,7 +231,6 @@ static Optional findPreviousSpillSlot(const Value *Val, return None; } - /// Return true if-and-only-if the given SDValue can be lowered as either a /// constant argument or a stack reference. The key point is that the value /// doesn't need to be spilled or tracked as a vreg use. 
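The Record.type / Record.payload accesses used from here on imply a small tagged union behind the RecordType typedef above. A minimal sketch of its shape, inferred from those accesses (the real definition lives with FunctionLoweringInfo, so naming details here are assumptions):

struct StatepointRelocationRecord {
  // How a gc.relocate's derived pointer is made available after the call.
  enum { Spill, VReg, NoRelocate } type;
  union {
    int FI;       // Stack slot frame index, valid when type == Spill.
    Register Reg; // Virtual register, valid when type == VReg.
  } payload;
};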
@@ -242,7 +251,6 @@ static bool willLowerDirectly(SDValue Incoming) { Incoming.isUndef()); } - /// Try to find existing copies of the incoming values in stack slots used for /// statepoint spilling. If we can find a spill slot for the incoming value, /// mark that slot as allocated, and reuse the same slot for this safepoint. @@ -388,7 +396,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, StoreMMO); MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc)); - + Builder.StatepointLowering.setLocation(Incoming, Loc); } @@ -485,7 +493,9 @@ lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot, /// will be set to the last value spilled (if any were). static void lowerStatepointMetaArgs(SmallVectorImpl &Ops, - SmallVectorImpl &MemRefs, SelectionDAGBuilder::StatepointLoweringInfo &SI, + SmallVectorImpl &MemRefs, + DenseMap &LowerAsVReg, + SelectionDAGBuilder::StatepointLoweringInfo &SI, SelectionDAGBuilder &Builder) { // Lower the deopt and gc arguments for this statepoint. Layout will be: // deopt argument length, deopt arguments.., gc arguments... @@ -531,6 +541,27 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, const bool LiveInDeopt = SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn; + // Decide which derived pointers will go on VRegs + const unsigned MaxTiedRegs = 15; // Max number of tied regs MI can have. + unsigned MaxVRegPtrs = + std::min(MaxTiedRegs, MaxRegistersForGCPointers.getValue()); + + LLVM_DEBUG(dbgs() << "Deciding how to lower GC Pointers:\n"); + unsigned CurNumVRegs = 0; + for (const Value *P : SI.Ptrs) { + if (LowerAsVReg.size() == MaxVRegPtrs) + break; + SDValue PtrSD = Builder.getValue(P); + if (willLowerDirectly(PtrSD) || P->getType()->isVectorTy()) { + LLVM_DEBUG(dbgs() << "direct/spill "; PtrSD.dump(&Builder.DAG)); + continue; + } + LLVM_DEBUG(dbgs() << "vreg "; PtrSD.dump(&Builder.DAG)); + LowerAsVReg[PtrSD] = CurNumVRegs++; + } + LLVM_DEBUG(dbgs() << LowerAsVReg.size() + << " derived pointers will go in vregs\n"); + auto isGCValue = [&](const Value *V) { auto *Ty = V->getType(); if (!Ty->isPtrOrPtrVectorTy()) @@ -542,7 +573,9 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, }; auto requireSpillSlot = [&](const Value *V) { - return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V); + if (isGCValue(V)) + return !LowerAsVReg.count(Builder.getValue(V)); + return !(LiveInDeopt || UseRegistersForDeoptValues); }; // Before we actually start lowering (and allocating spill slots for values), @@ -554,9 +587,14 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, if (requireSpillSlot(V)) reservePreviousStackSlotForValue(V, Builder); } + for (unsigned i = 0; i < SI.Bases.size(); ++i) { - reservePreviousStackSlotForValue(SI.Bases[i], Builder); - reservePreviousStackSlotForValue(SI.Ptrs[i], Builder); + SDValue SDV = Builder.getValue(SI.Bases[i]); + if (!LowerAsVReg.count(SDV)) + reservePreviousStackSlotForValue(SI.Bases[i], Builder); + SDV = Builder.getValue(SI.Ptrs[i]); + if (!LowerAsVReg.count(SDV)) + reservePreviousStackSlotForValue(SI.Ptrs[i], Builder); } // First, prefix the list with the number of unique values to be @@ -567,6 +605,7 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, // The vm state arguments are lowered in an opaque manner. We do not know // what type of values are contained within.
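// Worked example for the GC-pointer selection above (hypothetical run): with
// -max-registers-for-gc-values=4 and six derived pointers, the first four that
// neither lower directly nor have vector type take vreg slots 0-3; the other
// two, like anything beyond the 15 tied-operand cap, fall back to spill slots.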
+ LLVM_DEBUG(dbgs() << "Lowering deopt state\n"); for (const Value *V : SI.DeoptState) { SDValue Incoming; // If this is a function argument at a static frame index, generate it as @@ -578,6 +617,8 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, } if (!Incoming.getNode()) Incoming = Builder.getValue(V); + LLVM_DEBUG(dbgs() << "Value " << *V + << " requireSpillSlot = " << requireSpillSlot(V) << "\n"); lowerIncomingStatepointValue(Incoming, requireSpillSlot(V), Ops, MemRefs, Builder); } @@ -588,14 +629,15 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, // it's (lowered) derived pointer. i.e // (base[0], ptr[0], base[1], ptr[1], ...) for (unsigned i = 0; i < SI.Bases.size(); ++i) { - const Value *Base = SI.Bases[i]; - lowerIncomingStatepointValue(Builder.getValue(Base), - /*RequireSpillSlot*/ true, Ops, MemRefs, + bool RequireSpillSlot; + SDValue Base = Builder.getValue(SI.Bases[i]); + RequireSpillSlot = !LowerAsVReg.count(Base); + lowerIncomingStatepointValue(Base, RequireSpillSlot, Ops, MemRefs, Builder); - const Value *Ptr = SI.Ptrs[i]; - lowerIncomingStatepointValue(Builder.getValue(Ptr), - /*RequireSpillSlot*/ true, Ops, MemRefs, + SDValue Derived = Builder.getValue(SI.Ptrs[i]); + RequireSpillSlot = !LowerAsVReg.count(Derived); + lowerIncomingStatepointValue(Derived, RequireSpillSlot, Ops, MemRefs, Builder); } @@ -618,39 +660,6 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, MemRefs.push_back(MMO); } } - - // Record computed locations for all lowered values. - // This can not be embedded in lowering loops as we need to record *all* - // values, while previous loops account only values with unique SDValues. - const Instruction *StatepointInstr = SI.StatepointInstr; - auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr]; - - for (const GCRelocateInst *Relocate : SI.GCRelocates) { - const Value *V = Relocate->getDerivedPtr(); - SDValue SDV = Builder.getValue(V); - SDValue Loc = Builder.StatepointLowering.getLocation(SDV); - - if (Loc.getNode()) { - SpillMap[V] = cast(Loc)->getIndex(); - } else { - // Record value as visited, but not spilled. This is case for allocas - // and constants. For this values we can avoid emitting spill load while - // visiting corresponding gc_relocate. - // Actually we do not need to record them in this map at all. - // We do this only to check that we are not relocating any unvisited - // value. - SpillMap[V] = None; - - // Default llvm mechanisms for exporting values which are used in - // different basic blocks does not work for gc relocates. - // Note that it would be incorrect to teach llvm that all relocates are - // uses of the corresponding values so that it would automatically - // export them. Relocates of the spilled values does not use original - // value. 
- if (Relocate->getParent() != StatepointInstr->getParent()) - Builder.ExportFromCurrentBlock(V); - } - } } SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( @@ -665,6 +674,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( assert(SI.Bases.size() == SI.Ptrs.size() && SI.Ptrs.size() <= SI.GCRelocates.size()); + LLVM_DEBUG(dbgs() << "Lowering statepoint " << *SI.StatepointInstr << "\n"); #ifndef NDEBUG for (auto *Reloc : SI.GCRelocates) if (Reloc->getParent() == SI.StatepointInstr->getParent()) @@ -674,7 +684,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( // Lower statepoint vmstate and gcstate arguments SmallVector LoweredMetaArgs; SmallVector MemRefs; - lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this); + // Maps derived pointer SDValue to statepoint result of relocated pointer. + DenseMap LowerAsVReg; + lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, LowerAsVReg, SI, *this); // Now that we've emitted the spills, we need to update the root so that the // call sequence is ordered correctly. @@ -788,12 +800,81 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( // Compute return values. Provide a glue output since we consume one as // input. This allows someone else to chain off us as needed. - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector NodeTys; + for (auto &Ptr : SI.Ptrs) { + SDValue SD = getValue(Ptr); + if (!LowerAsVReg.count(SD)) + continue; + NodeTys.push_back(SD.getValueType()); + } + LLVM_DEBUG(dbgs() << "Statepoint has " << NodeTys.size() << " results\n"); + assert(NodeTys.size() == LowerAsVReg.size() && "Inconsistent GC Ptr lowering"); + NodeTys.push_back(MVT::Other); + NodeTys.push_back(MVT::Glue); + unsigned NumResults = NodeTys.size(); MachineSDNode *StatepointMCNode = DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops); DAG.setNodeMemRefs(StatepointMCNode, MemRefs); + + // For values lowered to tied-defs, create the virtual registers. Note that + // for simplicity, we *always* create a vreg even within a single block. + DenseMap VirtRegs; + for (const auto *Relocate : SI.GCRelocates) { + Value *Derived = Relocate->getDerivedPtr(); + SDValue SD = getValue(Derived); + if (!LowerAsVReg.count(SD)) + continue; + + // Handle multiple gc.relocates of the same input efficiently. + if (VirtRegs.count(Derived)) + continue; + + SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]); + + auto *RetTy = Relocate->getType(); + Register Reg = FuncInfo.CreateRegs(RetTy); + RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), + DAG.getDataLayout(), Reg, RetTy, None); + SDValue Chain = DAG.getEntryNode(); + RFV.getCopyToRegs(Relocated, DAG, getCurSDLoc(), Chain, nullptr); + PendingExports.push_back(Chain); + + VirtRegs[Derived] = Reg; + } + + // Record for later use how each relocation was lowered. This is needed to + // allow later gc.relocates to mirror the lowering chosen. 
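// Illustrative result layout: if two derived pointers were given vregs, say
// both of type p0, then NodeTys above is [p0, p0, Other, Glue]. The chain is
// therefore always result NumResults - 2 and the glue result NumResults - 1,
// which is exactly how the transition and call-replacement code below indexes
// into the STATEPOINT node.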
+ const Instruction *StatepointInstr = SI.StatepointInstr; + auto &RelocationMap = FuncInfo.StatepointRelocationMaps[StatepointInstr]; + for (const GCRelocateInst *Relocate : SI.GCRelocates) { + const Value *V = Relocate->getDerivedPtr(); + SDValue SDV = getValue(V); + SDValue Loc = StatepointLowering.getLocation(SDV); + + RecordType Record; + if (Loc.getNode()) { + Record.type = RecordType::Spill; + Record.payload.FI = cast(Loc)->getIndex(); + } else if (LowerAsVReg.count(SDV)) { + Record.type = RecordType::VReg; + assert(VirtRegs.count(V)); + Record.payload.Reg = VirtRegs[V]; + } else { + Record.type = RecordType::NoRelocate; + // If we didn't relocate a value, we'll essentialy end up inserting an + // additional use of the original value when lowering the gc.relocate. + // We need to make sure the value is available at the new use, which + // might be in another block. + if (Relocate->getParent() != StatepointInstr->getParent()) + ExportFromCurrentBlock(V); + } + RelocationMap[V] = Record; + } + + + SDNode *SinkNode = StatepointMCNode; // Build the GC_TRANSITION_END node if necessary. @@ -804,7 +885,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( SmallVector TEOps; // Add chain - TEOps.push_back(SDValue(StatepointMCNode, 0)); + TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2)); // Add GC transition arguments for (const Value *V : SI.GCTransitionArgs) { @@ -814,7 +895,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( } // Add glue - TEOps.push_back(SDValue(StatepointMCNode, 1)); + TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1)); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -825,7 +906,12 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT( } // Replace original call - DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root + // Call: ch,glue = CALL ... + // Statepoint: [gc relocates],ch,glue = STATEPOINT ... + unsigned NumSinkValues = SinkNode->getNumValues(); + SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2), + SDValue(SinkNode, NumSinkValues - 1)}; + DAG.ReplaceAllUsesWith(CallNode, StatepointValues); // Remove original call node DAG.DeleteNode(CallNode); @@ -927,7 +1013,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I, setValue(&I, ReturnValue); return; } - + // Result value will be used in a different basic block so we need to export // it now. Default exporting mechanism will not work here because statepoint // call has a different type than the actual call. It means that by default @@ -1024,6 +1110,25 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { #endif const Value *DerivedPtr = Relocate.getDerivedPtr(); + auto &RelocationMap = + FuncInfo.StatepointRelocationMaps[Relocate.getStatepoint()]; + auto SlotIt = RelocationMap.find(DerivedPtr); + assert(SlotIt != RelocationMap.end() && "Relocating not lowered gc value"); + const RecordType &Record = SlotIt->second; + + // If relocation was done via virtual register.. + if (Record.type == RecordType::VReg) { + Register InReg = Record.payload.Reg; + RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), + DAG.getDataLayout(), InReg, Relocate.getType(), + None); // This is not an ABI copy. 
+ SDValue Chain = DAG.getEntryNode(); + SDValue Relocation = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), + Chain, nullptr, nullptr); + setValue(&Relocate, Relocation); + return; + } + SDValue SD = getValue(DerivedPtr); if (SD.isUndef() && SD.getValueType().getSizeInBits() <= 64) { @@ -1033,19 +1138,17 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { return; } - auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()]; - auto SlotIt = SpillMap.find(DerivedPtr); - assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value"); - Optional<int> DerivedPtrLocation = SlotIt->second; // We didn't need to spill these special cases (constants and allocas). // See the handling in spillIncomingValueForStatepoint for detail. - if (!DerivedPtrLocation) { + if (Record.type == RecordType::NoRelocate) { setValue(&Relocate, SD); return; } - unsigned Index = *DerivedPtrLocation; + assert(Record.type == RecordType::Spill); + + unsigned Index = Record.payload.FI; SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy()); // All the reloads are independent and are reading memory only modified by diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d140b15067a6e..c7829cdd9cdbd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1858,6 +1858,11 @@ bool TargetLowering::SimplifyDemandedBits( assert(!Known.hasConflict() && "Bits known to be one AND zero?"); assert(Known.getBitWidth() == InBits && "Src width has changed?"); Known = Known.zext(BitWidth); + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( + Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc)); break; } case ISD::SIGN_EXTEND: @@ -1906,6 +1911,11 @@ bool TargetLowering::SimplifyDemandedBits( if (!TLO.LegalOperations() || isOperationLegal(Opc, VT)) return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src)); } + + // Attempt to avoid multi-use ops if we don't need anything from them.
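+ // If the source simplifies for just the demanded bits, rebuild the extend + // on the simplified source; other users keep the original node.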
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( + Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc)); break; } case ISD::ANY_EXTEND: @@ -4312,8 +4322,8 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const { } SDValue TargetLowering::LowerAsmOutputForConstraint( - SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo, - SelectionDAG &DAG) const { + SDValue &Chain, SDValue &Flag, const SDLoc &DL, + const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { return SDValue(); } @@ -5918,7 +5928,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // Legalization Utilities //===----------------------------------------------------------------------===// -bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, +bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, SDValue LHS, SDValue RHS, SmallVectorImpl &Result, EVT HiLoVT, SelectionDAG &DAG, @@ -6106,7 +6116,7 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, SDValue LL, SDValue LH, SDValue RL, SDValue RH) const { SmallVector Result; - bool Ok = expandMUL_LOHI(N->getOpcode(), N->getValueType(0), N, + bool Ok = expandMUL_LOHI(N->getOpcode(), N->getValueType(0), SDLoc(N), N->getOperand(0), N->getOperand(1), Result, HiLoVT, DAG, Kind, LL, LH, RL, RH); if (Ok) { diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index d720d93c306db..7258feafb7e90 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -1048,7 +1048,7 @@ void StackColoring::remapInstructions(DenseMap &SlotRemap) { if (MMO->getAAInfo()) { if (const Value *MMOV = MMO->getValue()) { SmallVector Objs; - getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout()); + getUnderlyingObjectsForCodeGen(MMOV, Objs); if (Objs.empty()) MayHaveConflictingAAMD = true; diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index a343791807e64..e246c2e5f55cb 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -251,10 +251,9 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, static const CallInst *findStackProtectorIntrinsic(Function &F) { for (const BasicBlock &BB : F) for (const Instruction &I : BB) - if (const CallInst *CI = dyn_cast(&I)) - if (CI->getCalledFunction() == - Intrinsic::getDeclaration(F.getParent(), Intrinsic::stackprotector)) - return CI; + if (const auto *II = dyn_cast(&I)) + if (II->getIntrinsicID() == Intrinsic::stackprotector) + return II; return nullptr; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 2c94c2c62e5f0..4562e1e018c0b 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -801,6 +801,11 @@ bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const { } } +bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return TM.isNoopAddrSpaceCast(SrcAS, DestAS); +} + void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) { // If the command-line option was specified, ignore this request. if (!JumpIsExpensiveOverride.getNumOccurrences()) @@ -1041,9 +1046,19 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // Inherit previous memory operands. 
MIB.cloneMemRefs(*MI); - for (auto &MO : MI->operands()) { + for (unsigned i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &MO = MI->getOperand(i); if (!MO.isFI()) { + // Index of the Def operand this Use is tied to. + // Since Defs come before Uses, if a Use is tied, then the + // index of its Def must be smaller than the index of that Use. + // Also, Defs preserve their position in the new MI. + unsigned TiedTo = i; + if (MO.isReg() && MO.isTied()) + TiedTo = MI->findTiedOperandIdx(i); MIB.add(MO); + if (TiedTo < i) + MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1); continue; } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index bf3b89e006405..8ef91250423f3 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -680,7 +680,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( // MD_associated in a unique section. unsigned UniqueID = MCContext::GenericSectionID; const MCSymbolELF *LinkedToSym = getLinkedToSymbol(GO, TM); - if (LinkedToSym) { + if (GO->getMetadata(LLVMContext::MD_associated)) { UniqueID = NextUniqueID++; Flags |= ELF::SHF_LINK_ORDER; } else { @@ -2027,8 +2027,8 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForExternalReference( MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - assert(!TM.getFunctionSections() && !TM.getDataSections() && - "XCOFF unique sections not yet implemented."); + assert(!TM.getDataSections() && + "XCOFF unique data sections not yet implemented."); // Common symbols go into a csect with matching name which will get mapped // into the .bss section. @@ -2057,8 +2057,13 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( Kind, /* BeginSymbolName */ nullptr); } - if (Kind.isText()) + if (Kind.isText()) { + if (TM.getFunctionSections()) { + return cast<MCSymbolXCOFF>(getFunctionEntryPointSymbol(GO, TM)) + ->getRepresentedCsect(); + } return TextSection; + } if (Kind.isData() || Kind.isReadOnlyWithRel()) // TODO: We may put this under option control, because user may want to @@ -2161,6 +2166,22 @@ MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol( SmallString<128> NameStr; NameStr.push_back('.'); getNameWithPrefix(NameStr, Func, TM); + + // When -function-sections is enabled, it's not necessary to emit the + // function entry point label any more. We will use the function entry + // point csect instead. For function declarations, it's okay to continue + // using label semantics because undefined symbols get treated as csects with + // the XTY_ER property anyway.
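+ // (An XTY_ER csect is an external reference left for the linker to + // resolve, so declarations need no entry point csect of their own.)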
+ if (TM.getFunctionSections() && !Func->isDeclaration() && + isa(Func)) { + XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(Func); + return cast(getContext().getXCOFFSection( + NameStr, XCOFF::XMC_PR, XCOFF::XTY_SD, SC, + SectionKind::getText())) + ->getQualNameSymbol(); + } + return getContext().getOrCreateSymbol(NameStr); } diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 1d76a49939dc0..7c60d51677188 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -24,7 +24,10 @@ JITLinkerBase::~JITLinkerBase() {} void JITLinkerBase::linkPhase1(std::unique_ptr Self) { - LLVM_DEBUG({ dbgs() << "Building jitlink graph for new input...\n"; }); + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << Ctx->getObjectBuffer().getBufferIdentifier() << "...\n"; + }); // Build the link graph. if (auto GraphOrErr = buildGraph(Ctx->getObjectBuffer())) @@ -67,7 +70,9 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { // Notify client that the defined symbols have been assigned addresses. LLVM_DEBUG( { dbgs() << "Resolving symbols defined in " << G->getName() << "\n"; }); - Ctx->notifyResolved(*G); + + if (auto Err = Ctx->notifyResolved(*G)) + return Ctx->notifyFailed(std::move(Err)); auto ExternalSymbols = getExternalSymbolNames(); @@ -332,12 +337,6 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { dbgs() << " " << Sym->getName() << ": " << formatv("{0:x16}", Sym->getAddress()) << "\n"; }); - assert(llvm::all_of(G->external_symbols(), - [](Symbol *Sym) { - return Sym->getAddress() != 0 || - Sym->getLinkage() == Linkage::Weak; - }) && - "All strong external symbols should have been resolved by now"); } void JITLinkerBase::copyBlockContentToWorkingMemory( @@ -445,16 +444,19 @@ void prune(LinkGraph &G) { VisitedBlocks.insert(&B); for (auto &E : Sym->getBlock().edges()) { - if (E.getTarget().isDefined() && !E.getTarget().isLive()) { - E.getTarget().setLive(true); + // If the edge target is a defined symbol that is being newly marked live + // then add it to the worklist. + if (E.getTarget().isDefined() && !E.getTarget().isLive()) Worklist.push_back(&E.getTarget()); - } + + // Mark the target live. + E.getTarget().setLive(true); } } - // Collect all the symbols to remove, then remove them. + // Collect all defined symbols to remove, then remove them. { - LLVM_DEBUG(dbgs() << "Dead-stripping symbols:\n"); + LLVM_DEBUG(dbgs() << "Dead-stripping defined symbols:\n"); std::vector SymbolsToRemove; for (auto *Sym : G.defined_symbols()) if (!Sym->isLive()) @@ -477,6 +479,19 @@ void prune(LinkGraph &G) { G.removeBlock(*B); } } + + // Collect all external symbols to remove, then remove them. 
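+ // Externals that are still dead at this point have no live references + // left, so removing them spares the linker useless lookup requests.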
+ { + LLVM_DEBUG(dbgs() << "Removing unused external symbols:\n"); + std::vector SymbolsToRemove; + for (auto *Sym : G.external_symbols()) + if (!Sym->isLive()) + SymbolsToRemove.push_back(Sym); + for (auto *Sym : SymbolsToRemove) { + LLVM_DEBUG(dbgs() << " " << *Sym << "...\n"); + G.removeExternalSymbol(*Sym); + } + } } } // end namespace jitlink diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 4d255cd66c1be..278f492f0ebe7 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -322,7 +322,8 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, auto ObjTT = Obj.getTriple(); if (ObjTT.getArch() == TT.getArch() && ObjTT.getSubArch() == TT.getSubArch() && - ObjTT.getVendor() == TT.getVendor()) { + (TT.getVendor() == Triple::UnknownVendor || + ObjTT.getVendor() == TT.getVendor())) { // We found a match. Create an instance from a buffer covering this // slice. auto SliceBuffer = MemoryBuffer::getFileSlice(FileName, Obj.getSize(), diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 8ad2e338ca82a..5b828ed84462a 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -97,7 +97,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { }); } - void notifyResolved(LinkGraph &G) override { + Error notifyResolved(LinkGraph &G) override { auto &ES = Layer.getExecutionSession(); SymbolFlagsMap ExtraSymbolsToClaim; @@ -143,7 +143,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { if (!ExtraSymbolsToClaim.empty()) if (auto Err = MR.defineMaterializing(ExtraSymbolsToClaim)) - return notifyFailed(std::move(Err)); + return Err; { @@ -169,12 +169,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } // If there were missing symbols then report the error. - if (!MissingSymbols.empty()) { - ES.reportError(make_error( - G.getName(), std::move(MissingSymbols))); - MR.failMaterialization(); - return; - } + if (!MissingSymbols.empty()) + return make_error(G.getName(), + std::move(MissingSymbols)); // If there are more definitions than expected, add them to the // ExtraSymbols vector. @@ -186,20 +183,16 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } // If there were extra definitions then report the error. 
- if (!ExtraSymbols.empty()) { - ES.reportError(make_error( - G.getName(), std::move(ExtraSymbols))); - MR.failMaterialization(); - return; - } + if (!ExtraSymbols.empty()) + return make_error(G.getName(), + std::move(ExtraSymbols)); } - if (auto Err = MR.notifyResolved(InternedResult)) { - Layer.getExecutionSession().reportError(std::move(Err)); - MR.failMaterialization(); - return; - } + if (auto Err = MR.notifyResolved(InternedResult)) + return Err; + Layer.notifyLoaded(MR); + return Error::success(); } void notifyFinalized( diff --git a/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp b/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp index ea8bde971d1d4..18de5b616eec8 100644 --- a/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TPCDynamicLibrarySearchGenerator.cpp @@ -14,7 +14,7 @@ namespace orc { Expected> TPCDynamicLibrarySearchGenerator::Load(TargetProcessControl &TPC, const char *LibraryPath) { - auto Handle = TPC.loadLibrary(LibraryPath); + auto Handle = TPC.loadDylib(LibraryPath); if (!Handle) return Handle.takeError(); @@ -30,7 +30,7 @@ Error TPCDynamicLibrarySearchGenerator::tryToGenerate( SymbolMap NewSymbols; - TargetProcessControl::LookupRequestElement Request(DylibHandle, Symbols); + TargetProcessControl::LookupRequestElement Request(H, Symbols); auto Result = TPC.lookupSymbols(Request); if (!Result) return Result.takeError(); diff --git a/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp index b4c21c32310ce..85dc4bec9a122 100644 --- a/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TPCIndirectionUtils.cpp @@ -297,15 +297,14 @@ TPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr, return Alloc.takeError(); auto WorkingMemory = (*Alloc)->getWorkingMemory(ResolverBlockPermissions); - auto TargetAddress = (*Alloc)->getTargetMemory(ResolverBlockPermissions); - ABI->writeResolverCode(WorkingMemory.data(), TargetAddress, ReentryFnAddr, + ResolverBlockAddr = (*Alloc)->getTargetMemory(ResolverBlockPermissions); + ABI->writeResolverCode(WorkingMemory.data(), ResolverBlockAddr, ReentryFnAddr, ReentryCtxAddr); if (auto Err = (*Alloc)->finalize()) return std::move(Err); ResolverBlock = std::move(*Alloc); - ResolverBlockAddr = ResolverBlock->getTargetMemory(ResolverBlockPermissions); return ResolverBlockAddr; } diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp index ab07d3ad90b11..f17f5bf32856f 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcessControl.cpp @@ -19,13 +19,12 @@ namespace orc { TargetProcessControl::MemoryAccess::~MemoryAccess() {} -TargetProcessControl::TargetProcessControl(Triple TT, unsigned PageSize) - : TT(std::move(TT)), PageSize(PageSize) {} - TargetProcessControl::~TargetProcessControl() {} -SelfTargetProcessControl::SelfTargetProcessControl(Triple TT, unsigned PageSize) - : TargetProcessControl(std::move(TT), PageSize) { +SelfTargetProcessControl::SelfTargetProcessControl(Triple TT, + unsigned PageSize) { + this->TT = std::move(TT); + this->PageSize = PageSize; this->MemMgr = IPMM.get(); this->MemAccess = this; if (this->TT.isOSBinFormatMachO()) @@ -43,11 +42,11 @@ SelfTargetProcessControl::Create() { return std::make_unique(std::move(TT), *PageSize); } -Expected -SelfTargetProcessControl::loadLibrary(const char *LibraryPath) { 
+Expected +SelfTargetProcessControl::loadDylib(const char *DylibPath) { std::string ErrMsg; auto Dylib = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(LibraryPath, &ErrMsg)); + sys::DynamicLibrary::getPermanentLibrary(DylibPath, &ErrMsg)); if (!Dylib->isValid()) return make_error(std::move(ErrMsg), inconvertibleErrorCode()); DynamicLibraries.push_back(std::move(Dylib)); diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 7c39ddc8b1da0..7ed8a718ed3c1 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -269,7 +269,7 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, uint64_t SymOffset) { switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_X86_64_NONE: break; @@ -359,7 +359,7 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, default: // There are other relocation types, but it appears these are the // only ones currently used by the LLVM ELF object writer - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; } } @@ -382,7 +382,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_AARCH64_ABS16: { uint64_t Result = Value + Addend; @@ -721,7 +721,7 @@ void RuntimeDyldELF::resolvePPC32Relocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_PPC_ADDR16_LO: writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); @@ -741,7 +741,7 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_PPC64_ADDR16: writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); @@ -835,7 +835,7 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_390_PC16DBL: case ELF::R_390_PLT16DBL: { @@ -890,7 +890,7 @@ void RuntimeDyldELF::resolveBPFRelocation(const SectionEntry &Section, switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_BPF_NONE: break; diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index c44e858ab5ed5..56a6e2b08bd90 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -175,11 +175,11 @@ static int isVariantApplicableInContextHelper( LLVM_DEBUG({ if (MK == MK_ALL) dbgs() << "[" << DEBUG_TYPE << "] Property " - << getOpenMPContextTraitPropertyName(Property) + << getOpenMPContextTraitPropertyName(Property, "") << " 
was not in the OpenMP context but match kind is all.\n"; if (MK == MK_NONE) dbgs() << "[" << DEBUG_TYPE << "] Property " - << getOpenMPContextTraitPropertyName(Property) + << getOpenMPContextTraitPropertyName(Property, "") << " was in the OpenMP context but match kind is none.\n"; }); return false; @@ -198,6 +198,14 @@ static int isVariantApplicableInContextHelper( continue; bool IsActiveTrait = Ctx.ActiveTraits.test(unsigned(Property)); + + // We overwrite the isa trait as it is actually up to the OMPContext hook to + // check the raw string(s). + if (Property == TraitProperty::device_isa___ANY) + IsActiveTrait = llvm::all_of(VMI.ISATraits, [&](StringRef RawString) { + return Ctx.matchesISATrait(RawString); + }); + Optional Result = HandleTrait(Property, IsActiveTrait); if (Result.hasValue()) return Result.getValue(); @@ -225,7 +233,7 @@ static int isVariantApplicableInContextHelper( if (!FoundInOrder) { LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] Construct property " - << getOpenMPContextTraitPropertyName(Property) + << getOpenMPContextTraitPropertyName(Property, "") << " was not nested properly.\n"); return false; } @@ -425,8 +433,12 @@ StringRef llvm::omp::getOpenMPContextTraitSelectorName(TraitSelector Kind) { llvm_unreachable("Unknown trait selector!"); } -TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind(TraitSet Set, - StringRef S) { +TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind( + TraitSet Set, TraitSelector Selector, StringRef S) { + // Special handling for `device={isa(...)}` as we accept anything here. It is + // up to the target to decide if the feature is available. + if (Set == TraitSet::device && Selector == TraitSelector::device_isa) + return TraitProperty::device_isa___ANY; #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ if (Set == TraitSet::TraitSetEnum && Str == S) \ return TraitProperty::Enum; @@ -444,7 +456,10 @@ llvm::omp::getOpenMPContextTraitPropertyForSelector(TraitSelector Selector) { #include "llvm/Frontend/OpenMP/OMPKinds.def" .Default(TraitProperty::invalid); } -StringRef llvm::omp::getOpenMPContextTraitPropertyName(TraitProperty Kind) { +StringRef llvm::omp::getOpenMPContextTraitPropertyName(TraitProperty Kind, + StringRef RawString) { + if (Kind == TraitProperty::device_isa___ANY) + return RawString; switch (Kind) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ case TraitProperty::Enum: \ diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 9468a3aa3c8dd..a5fe4ec87c46b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -394,9 +394,10 @@ void OpenMPIRBuilder::emitCancelationCheckImpl( } IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( - const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, - PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, - Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { + const LocationDescription &Loc, InsertPointTy OuterAllocaIP, + BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, + omp::ProcBindKind ProcBind, bool IsCancellable) { if (!updateToLocation(Loc)) return Loc.IP; @@ -429,7 +430,9 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( // we want to delete at the end. 
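+ // (Instructions appended to ToBeDeleted are erased once outlining of the + // parallel region has finished.)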
SmallVector ToBeDeleted; - Builder.SetInsertPoint(OuterFn->getEntryBlock().getFirstNonPHI()); + // Change the location to the outer alloca insertion point to create and + // initialize the allocas we pass into the parallel region. + Builder.restoreIP(OuterAllocaIP); AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr"); AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr"); @@ -481,9 +484,9 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( // Generate the privatization allocas in the block that will become the entry // of the outlined function. - InsertPointTy AllocaIP(PRegEntryBB, - PRegEntryBB->getTerminator()->getIterator()); - Builder.restoreIP(AllocaIP); + Builder.SetInsertPoint(PRegEntryBB->getTerminator()); + InsertPointTy InnerAllocaIP = Builder.saveIP(); + AllocaInst *PrivTIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr.local"); Instruction *PrivTID = Builder.CreateLoad(PrivTIDAddr, "tid"); @@ -512,7 +515,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( // Let the caller create the body. assert(BodyGenCB && "Expected body generation callback!"); InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); - BodyGenCB(AllocaIP, CodeGenIP, *PRegPreFiniBB); + BodyGenCB(InnerAllocaIP, CodeGenIP, *PRegPreFiniBB); LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); @@ -671,7 +674,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( ReplacementValue = PrivTID; } else { Builder.restoreIP( - PrivCB(AllocaIP, Builder.saveIP(), V, ReplacementValue)); + PrivCB(InnerAllocaIP, Builder.saveIP(), V, ReplacementValue)); assert(ReplacementValue && "Expected copy/create callback to set replacement value!"); if (ReplacementValue == &V) @@ -686,6 +689,10 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel( LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n"); PrivHelper(*Input); } + LLVM_DEBUG({ + for (Value *Output : Outputs) + LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n"); + }); assert(Outputs.empty() && "OpenMP outlining should not produce live-out values!"); diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index eabaaa203927f..396c39b5b3a85 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Compiler.h" @@ -835,6 +836,54 @@ ConstantRange ConstantRange::overflowingBinaryOp(Instruction::BinaryOps BinOp, } } +bool ConstantRange::isIntrinsicSupported(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::umin: + case Intrinsic::umax: + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::abs: + return true; + default: + return false; + } +} + +ConstantRange ConstantRange::intrinsic(Intrinsic::ID IntrinsicID, + ArrayRef Ops) { + switch (IntrinsicID) { + case Intrinsic::uadd_sat: + return Ops[0].uadd_sat(Ops[1]); + case Intrinsic::usub_sat: + return Ops[0].usub_sat(Ops[1]); + case Intrinsic::sadd_sat: + return Ops[0].sadd_sat(Ops[1]); + case Intrinsic::ssub_sat: + return Ops[0].ssub_sat(Ops[1]); + case Intrinsic::umin: + return Ops[0].umin(Ops[1]); + case Intrinsic::umax: + return Ops[0].umax(Ops[1]); + case Intrinsic::smin: + return Ops[0].smin(Ops[1]); + case 
Intrinsic::smax: + return Ops[0].smax(Ops[1]); + case Intrinsic::abs: { + const APInt *IntMinIsPoison = Ops[1].getSingleElement(); + assert(IntMinIsPoison && "Must be known (immarg)"); + assert(IntMinIsPoison->getBitWidth() == 1 && "Must be boolean"); + return Ops[0].abs(IntMinIsPoison->getBoolValue()); + } + default: + assert(!isIntrinsicSupported(IntrinsicID) && "Shouldn't be supported"); + llvm_unreachable("Unsupported intrinsic"); + } +} + ConstantRange ConstantRange::add(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) @@ -1418,7 +1467,7 @@ ConstantRange ConstantRange::inverse() const { return ConstantRange(Upper, Lower); } -ConstantRange ConstantRange::abs() const { +ConstantRange ConstantRange::abs(bool IntMinIsPoison) const { if (isEmptySet()) return getEmpty(); @@ -1430,12 +1479,23 @@ ConstantRange ConstantRange::abs() const { else Lo = APIntOps::umin(Lower, -Upper + 1); - // SignedMin is included in the result range. - return ConstantRange(Lo, APInt::getSignedMinValue(getBitWidth()) + 1); + // If SignedMin is not poison, then it is included in the result range. + if (IntMinIsPoison) + return ConstantRange(Lo, APInt::getSignedMinValue(getBitWidth())); + else + return ConstantRange(Lo, APInt::getSignedMinValue(getBitWidth()) + 1); } APInt SMin = getSignedMin(), SMax = getSignedMax(); + // Skip SignedMin if it is poison. + if (IntMinIsPoison && SMin.isMinSignedValue()) { + // The range may become empty if it *only* contains SignedMin. + if (SMax.isMinSignedValue()) + return getEmpty(); + ++SMin; + } + // All non-negative. if (SMin.isNonNegative()) return *this; diff --git a/llvm/lib/IR/Dominators.cpp b/llvm/lib/IR/Dominators.cpp index bb1cc347dcb12..f0b169ea1337f 100644 --- a/llvm/lib/IR/Dominators.cpp +++ b/llvm/lib/IR/Dominators.cpp @@ -90,9 +90,10 @@ template void llvm::DomTreeBuilder::DeleteEdge( DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); template void llvm::DomTreeBuilder::ApplyUpdates( - DomTreeBuilder::BBDomTree &DT, DomTreeBuilder::BBUpdates); + DomTreeBuilder::BBDomTree &DT, DomTreeBuilder::BBDomTreeGraphDiff &); template void llvm::DomTreeBuilder::ApplyUpdates( - DomTreeBuilder::BBPostDomTree &DT, DomTreeBuilder::BBUpdates); + DomTreeBuilder::BBPostDomTree &DT, + DomTreeBuilder::BBPostDomTreeGraphDiff &); template bool llvm::DomTreeBuilder::Verify( const DomTreeBuilder::BBDomTree &DT, diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 1fffce015f707..f223f2c6a2b3d 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -996,17 +996,22 @@ Value *IRBuilderBase::CreateStripInvariantGroup(Value *Ptr) { Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name) { - assert(NumElts > 0 && "Cannot splat to an empty vector!"); + ElementCount EC(NumElts, false); + return CreateVectorSplat(EC, V, Name); +} + +Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, + const Twine &Name) { + assert(EC.Min > 0 && "Cannot splat to an empty vector!"); // First insert it into an undef vector so we can shuffle it. Type *I32Ty = getInt32Ty(); - Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), NumElts)); + Value *Undef = UndefValue::get(VectorType::get(V->getType(), EC)); V = CreateInsertElement(Undef, V, ConstantInt::get(I32Ty, 0), Name + ".splatinsert"); // Shuffle the value across the desired number of elements. 
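+ // An all-zeros mask broadcasts lane 0 into every element, and + // ConstantAggregateZero can express it even for scalable vectors, where + // an explicit list of indices could not be enumerated.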
- Value *Zeros = - ConstantAggregateZero::get(FixedVectorType::get(I32Ty, NumElts)); + Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32Ty, EC)); return CreateShuffleVector(V, Undef, Zeros, Name + ".splat"); } diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index 03657ff8d9d43..7c73d2ab98711 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -11,13 +11,14 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; PrintModulePass::PrintModulePass() : OS(dbgs()) {} diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index b97ac37c5fcfd..e8fdaa23761c3 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -57,27 +57,7 @@ class Type; class Value; class ValueHandleBase; -struct DenseMapAPIntKeyInfo { - static inline APInt getEmptyKey() { - APInt V(nullptr, 0); - V.U.VAL = 0; - return V; - } - - static inline APInt getTombstoneKey() { - APInt V(nullptr, 0); - V.U.VAL = 1; - return V; - } - - static unsigned getHashValue(const APInt &Key) { - return static_cast(hash_value(Key)); - } - - static bool isEqual(const APInt &LHS, const APInt &RHS) { - return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS; - } -}; +using DenseMapAPIntKeyInfo = DenseMapInfo; struct DenseMapAPFloatKeyInfo { static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus(), 1); } diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 74869fa62c66f..96434ae3306b3 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -1586,9 +1586,12 @@ bool FPPassManager::runOnFunction(Function &F) { #endif LocalChanged |= FP->runOnFunction(F); -#ifdef EXPENSIVE_CHECKS - assert((LocalChanged || (RefHash == StructuralHash(F))) && - "Pass modifies its input and doesn't report it."); +#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG) + if (!LocalChanged && (RefHash != StructuralHash(F))) { + llvm::errs() << "Pass modifies its input and doesn't report it: " + << FP->getPassName() << "\n"; + assert(false && "Pass modifies its input and doesn't report it."); + } #endif if (EmitICRemark) { @@ -1614,7 +1617,8 @@ bool FPPassManager::runOnFunction(Function &F) { dumpUsedSet(FP); verifyPreservedAnalysis(FP); - removeNotPreservedAnalysis(FP); + if (LocalChanged) + removeNotPreservedAnalysis(FP); recordAvailableAnalysis(FP); removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG); } @@ -1723,7 +1727,8 @@ MPPassManager::runOnModule(Module &M) { dumpUsedSet(MP); verifyPreservedAnalysis(MP); - removeNotPreservedAnalysis(MP); + if (LocalChanged) + removeNotPreservedAnalysis(MP); recordAvailableAnalysis(MP); removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG); } diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 91612eafada73..5346323ceabba 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -163,7 +163,9 @@ bool ModuleSummaryIndex::isGUIDLive(GlobalValue::GUID GUID) const { return false; } -static void propagateAttributesToRefs(GlobalValueSummary *S) { +static void +propagateAttributesToRefs(GlobalValueSummary *S, + DenseSet 
&MarkedNonReadWriteOnly) { // If reference is not readonly or writeonly then referenced summary is not // read/writeonly either. Note that: // - All references from GlobalVarSummary are conservatively considered as @@ -174,6 +176,11 @@ static void propagateAttributesToRefs(GlobalValueSummary *S) { // for them. for (auto &VI : S->refs()) { assert(VI.getAccessSpecifier() == 0 || isa(S)); + if (!VI.getAccessSpecifier()) { + if (!MarkedNonReadWriteOnly.insert(VI).second) + continue; + } else if (MarkedNonReadWriteOnly.find(VI) != MarkedNonReadWriteOnly.end()) + continue; for (auto &Ref : VI.getSummaryList()) // If references to alias is not read/writeonly then aliasee // is not read/writeonly @@ -216,11 +223,24 @@ void ModuleSummaryIndex::propagateAttributes( const DenseSet &GUIDPreservedSymbols) { if (!PropagateAttrs) return; + DenseSet MarkedNonReadWriteOnly; for (auto &P : *this) for (auto &S : P.second.SummaryList) { - if (!isGlobalValueLive(S.get())) + if (!isGlobalValueLive(S.get())) { + // computeDeadSymbols should have marked all copies live. Note that + // it is possible that there is a GUID collision between internal + // symbols with the same name in different files of the same name but + // not enough distinguishing path. Because computeDeadSymbols should + // conservatively mark all copies live we can assert here that all are + // dead if any copy is dead. + assert(llvm::none_of( + P.second.SummaryList, + [&](const std::unique_ptr &Summary) { + return isGlobalValueLive(Summary.get()); + })); // We don't examine references from dead objects - continue; + break; + } // Global variable can't be marked read/writeonly if it is not eligible // to import since we need to ensure that all external references get @@ -240,7 +260,7 @@ void ModuleSummaryIndex::propagateAttributes( GVS->setReadOnly(false); GVS->setWriteOnly(false); } - propagateAttributesToRefs(S.get()); + propagateAttributesToRefs(S.get(), MarkedNonReadWriteOnly); } setWithAttributePropagation(); if (llvm::AreStatisticsEnabled()) diff --git a/llvm/lib/IR/PassInstrumentation.cpp b/llvm/lib/IR/PassInstrumentation.cpp index 49cc6ec04d90c..d3867de44bcac 100644 --- a/llvm/lib/IR/PassInstrumentation.cpp +++ b/llvm/lib/IR/PassInstrumentation.cpp @@ -12,10 +12,19 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/PassInstrumentation.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/IR/PassManager.h" namespace llvm { AnalysisKey PassInstrumentationAnalysis::Key; +bool isSpecialPass(StringRef PassID, const std::vector &Specials) { + size_t Pos = PassID.find('<'); + if (Pos == StringRef::npos) + return false; + StringRef Prefix = PassID.substr(0, Pos); + return any_of(Specials, [Prefix](StringRef S) { return Prefix.endswith(S); }); +} + } // namespace llvm diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp index 829c3e8ea9ad4..fb551ddf5c727 100644 --- a/llvm/lib/IR/PassTimingInfo.cpp +++ b/llvm/lib/IR/PassTimingInfo.cpp @@ -231,30 +231,20 @@ void TimePassesHandler::stopTimer(StringRef PassID) { MyTimer->stopTimer(); } -static bool matchPassManager(StringRef PassID) { - size_t prefix_pos = PassID.find('<'); - if (prefix_pos == StringRef::npos) - return false; - StringRef Prefix = PassID.substr(0, prefix_pos); - return Prefix.endswith("PassManager") || Prefix.endswith("PassAdaptor") || - Prefix.endswith("AnalysisManagerProxy"); -} - -bool TimePassesHandler::runBeforePass(StringRef PassID) { - if (matchPassManager(PassID)) - return true; +void 
TimePassesHandler::runBeforePass(StringRef PassID) { + if (isSpecialPass(PassID, + {"PassManager", "PassAdaptor", "AnalysisManagerProxy"})) + return; startTimer(PassID); LLVM_DEBUG(dbgs() << "after runBeforePass(" << PassID << ")\n"); LLVM_DEBUG(dump()); - - // we are not going to skip this pass, thus return true. - return true; } void TimePassesHandler::runAfterPass(StringRef PassID) { - if (matchPassManager(PassID)) + if (isSpecialPass(PassID, + {"PassManager", "PassAdaptor", "AnalysisManagerProxy"})) return; stopTimer(PassID); @@ -267,8 +257,8 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) { if (!Enabled) return; - PIC.registerBeforePassCallback( - [this](StringRef P, Any) { return this->runBeforePass(P); }); + PIC.registerBeforeNonSkippedPassCallback( + [this](StringRef P, Any) { this->runBeforePass(P); }); PIC.registerAfterPassCallback( [this](StringRef P, Any) { this->runAfterPass(P); }); PIC.registerAfterPassInvalidatedCallback( diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 7da592f401273..9105c6fbd230d 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -29,7 +29,7 @@ void User::replaceUsesOfWith(Value *From, Value *To) { // The side effects of this setOperand call include linking to // "To", adding "this" to the uses list of To, and // most importantly, removing "this" from the use list of "From". - setOperand(i, To); // Fix it now... + setOperand(i, To); } } diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index efb8d53e8964b..26a983596ab24 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -175,23 +175,36 @@ void Value::dropDroppableUses( for (Use &U : uses()) if (U.getUser()->isDroppable() && ShouldDrop(&U)) ToBeEdited.push_back(&U); - for (Use *U : ToBeEdited) { - U->removeFromList(); - if (auto *Assume = dyn_cast<IntrinsicInst>(U->getUser())) { - assert(Assume->getIntrinsicID() == Intrinsic::assume); - unsigned OpNo = U->getOperandNo(); - if (OpNo == 0) - Assume->setOperand(0, ConstantInt::getTrue(Assume->getContext())); - else { - Assume->setOperand(OpNo, UndefValue::get(U->get()->getType())); - CallInst::BundleOpInfo &BOI = Assume->getBundleOpInfoForOperand(OpNo); - BOI.Tag = getContext().pImpl->getOrInsertBundleTag("ignore"); - } - } else - llvm_unreachable("unknown droppable use"); + for (Use *U : ToBeEdited) + dropDroppableUse(*U); +} + +void Value::dropDroppableUsesIn(User &Usr) { + assert(Usr.isDroppable() && "Expected a droppable user!"); + for (Use &UsrOp : Usr.operands()) { + if (UsrOp.get() == this) + dropDroppableUse(UsrOp); } } +void Value::dropDroppableUse(Use &U) { + U.removeFromList(); + if (auto *Assume = dyn_cast<IntrinsicInst>(U.getUser())) { + assert(Assume->getIntrinsicID() == Intrinsic::assume); + unsigned OpNo = U.getOperandNo(); + if (OpNo == 0) + U.set(ConstantInt::getTrue(Assume->getContext())); + else { + U.set(UndefValue::get(U.get()->getType())); + CallInst::BundleOpInfo &BOI = Assume->getBundleOpInfoForOperand(OpNo); + BOI.Tag = getContext().pImpl->getOrInsertBundleTag("ignore"); + } + return; + } + + llvm_unreachable("unknown droppable use"); +} + bool Value::isUsedInBasicBlock(const BasicBlock *BB) const { // This can be computed either by scanning the instructions in BB, or by // scanning the use list of this Value.
Both lists can be very long, but diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 1ca9d0fe1e18c..b44a36b9713ef 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1024,9 +1024,13 @@ void ELFWriter::writeSection(const SectionIndexMapTy &SectionIndexMap, } if (Section.getFlags() & ELF::SHF_LINK_ORDER) { + // If the value in the associated metadata is not a definition, Sym will be + // undefined. Represent this with sh_link=0. const MCSymbol *Sym = Section.getLinkedToSymbol(); - const MCSectionELF *Sec = cast(&Sym->getSection()); - sh_link = SectionIndexMap.lookup(Sec); + if (Sym && Sym->isInSection()) { + const MCSectionELF *Sec = cast(&Sym->getSection()); + sh_link = SectionIndexMap.lookup(Sec); + } } WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getName()), @@ -1389,11 +1393,6 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm, if (Flags & ELF::SHF_MERGE) { if (C != 0) return true; - - // It looks like gold has a bug (http://sourceware.org/PR16794) and can - // only handle section relocations to mergeable sections if using RELA. - if (!hasRelocationAddend()) - return true; } // Most TLS relocations use a got, so they need the symbol. Even those that diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 3ca8714b7817c..9515b7e2642bc 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -62,8 +62,8 @@ STATISTIC(EmittedAlignFragments, "Number of emitted assembler fragments - align"); STATISTIC(EmittedFillFragments, "Number of emitted assembler fragments - fill"); -STATISTIC(EmittedOrgFragments, - "Number of emitted assembler fragments - org"); +STATISTIC(EmittedNopsFragments, "Number of emitted assembler fragments - nops"); +STATISTIC(EmittedOrgFragments, "Number of emitted assembler fragments - org"); STATISTIC(evaluateFixup, "Number of evaluated fixups"); STATISTIC(FragmentLayouts, "Number of fragment layouts"); STATISTIC(ObjectBytes, "Number of emitted object file bytes"); @@ -312,6 +312,9 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return Size; } + case MCFragment::FT_Nops: + return cast(F).getNumBytes(); + case MCFragment::FT_LEB: return cast(F).getContents().size(); @@ -613,6 +616,45 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, break; } + case MCFragment::FT_Nops: { + ++stats::EmittedNopsFragments; + const MCNopsFragment &NF = cast(F); + int64_t NumBytes = NF.getNumBytes(); + int64_t ControlledNopLength = NF.getControlledNopLength(); + int64_t MaximumNopLength = Asm.getBackend().getMaximumNopSize(); + + assert(NumBytes > 0 && "Expected positive NOPs fragment size"); + assert(ControlledNopLength >= 0 && "Expected non-negative NOP size"); + + if (ControlledNopLength > MaximumNopLength) { + Asm.getContext().reportError(NF.getLoc(), + "illegal NOP size " + + std::to_string(ControlledNopLength) + + ". (expected within [0, " + + std::to_string(MaximumNopLength) + "])"); + // Clamp the NOP length as reportError does not stop the execution + // immediately. 
+ ControlledNopLength = MaximumNopLength; + } + + // Use maximum value if the size of each NOP is not specified + if (!ControlledNopLength) + ControlledNopLength = MaximumNopLength; + + while (NumBytes) { + uint64_t NumBytesToEmit = + (uint64_t)std::min(NumBytes, ControlledNopLength); + assert(NumBytesToEmit && "try to emit empty NOP instruction"); + if (!Asm.getBackend().writeNopData(OS, NumBytesToEmit)) { + report_fatal_error("unable to write nop sequence of the remaining " + + Twine(NumBytesToEmit) + " bytes"); + break; + } + NumBytes -= NumBytesToEmit; + } + break; + } + case MCFragment::FT_LEB: { const MCLEBFragment &LF = cast(F); OS << LF.getContents(); diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index a0f9212f3b142..5b0078974318a 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -90,6 +90,7 @@ void MCContext::reset() { ELFAllocator.DestroyAll(); MachOAllocator.DestroyAll(); XCOFFAllocator.DestroyAll(); + MCInstAllocator.DestroyAll(); MCSubtargetAllocator.DestroyAll(); InlineAsmUsedLabelNames.clear(); @@ -126,6 +127,14 @@ void MCContext::reset() { HadError = false; } +//===----------------------------------------------------------------------===// +// MCInst Management +//===----------------------------------------------------------------------===// + +MCInst *MCContext::createMCInst() { + return new (MCInstAllocator.Allocate()) MCInst; +} + //===----------------------------------------------------------------------===// // Symbol Manipulation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 11fd32e48a4ae..a0aa7a3b95474 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -145,6 +145,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const { case MCBinaryExpr::Mul: OS << '*'; break; case MCBinaryExpr::NE: OS << "!="; break; case MCBinaryExpr::Or: OS << '|'; break; + case MCBinaryExpr::OrNot: OS << '!'; break; case MCBinaryExpr::Shl: OS << "<<"; break; case MCBinaryExpr::Sub: OS << '-'; break; case MCBinaryExpr::Xor: OS << '^'; break; @@ -920,6 +921,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, case MCBinaryExpr::Mul: Result = LHS * RHS; break; case MCBinaryExpr::NE: Result = LHS != RHS; break; case MCBinaryExpr::Or: Result = LHS | RHS; break; + case MCBinaryExpr::OrNot: Result = LHS | ~RHS; break; case MCBinaryExpr::Shl: Result = uint64_t(LHS) << uint64_t(RHS); break; case MCBinaryExpr::Sub: Result = LHS - RHS; break; case MCBinaryExpr::Xor: Result = LHS ^ RHS; break; diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 8e90e07a4dbfd..e9cea9d18f2e1 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -279,6 +279,9 @@ void MCFragment::destroy() { case FT_Fill: delete cast(this); return; + case FT_Nops: + delete cast(this); + return; case FT_Relaxable: delete cast(this); return; @@ -336,6 +339,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { case MCFragment::FT_CompactEncodedInst: OS << "MCCompactEncodedInstFragment"; break; case MCFragment::FT_Fill: OS << "MCFillFragment"; break; + case MCFragment::FT_Nops: + OS << "MCFNopsFragment"; + break; case MCFragment::FT_Relaxable: OS << "MCRelaxableFragment"; break; case MCFragment::FT_Org: OS << "MCOrgFragment"; break; case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break; @@ -408,6 +414,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { << " NumValues:" << 
FF->getNumValues(); break; } + case MCFragment::FT_Nops: { + const auto *NF = cast(this); + OS << " NumBytes:" << NF->getNumBytes() + << " ControlledNopLength:" << NF->getControlledNopLength(); + break; + } case MCFragment::FT_Relaxable: { const auto *F = cast(this); OS << "\n "; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 78ee215b59aae..f9e0d858cf701 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -819,6 +819,16 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size, insert(new MCFillFragment(Expr, Size, NumValues, Loc)); } +void MCObjectStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLength, + SMLoc Loc) { + // Emit an NOP fragment. + MCDataFragment *DF = getOrCreateDataFragment(); + flushPendingLabels(DF, DF->getContents().size()); + + assert(getCurrentSectionOnly() && "need a section"); + insert(new MCNopsFragment(NumBytes, ControlledNopLength, Loc)); +} + void MCObjectStreamer::emitFileDirective(StringRef Filename) { getAssembler().addFileName(Filename); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index c05f26cbdda5c..497f73e411057 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1497,8 +1497,6 @@ static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K, return 1; // Low Precedence: |, &, ^ - // - // FIXME: gas seems to support '!' as an infix operator? case AsmToken::Pipe: Kind = MCBinaryExpr::Or; return 2; @@ -1559,7 +1557,8 @@ static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K, } } -static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K, +static unsigned getGNUBinOpPrecedence(const MCAsmInfo &MAI, + AsmToken::TokenKind K, MCBinaryExpr::Opcode &Kind, bool ShouldUseLogicalShr) { switch (K) { @@ -1603,12 +1602,18 @@ static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K, Kind = MCBinaryExpr::Sub; return 4; - // High Intermediate Precedence: |, &, ^ + // High Intermediate Precedence: |, !, &, ^ // - // FIXME: gas seems to support '!' as an infix operator? case AsmToken::Pipe: Kind = MCBinaryExpr::Or; return 5; + case AsmToken::Exclaim: + // Hack to support ARM compatible aliases (implied 'sp' operand in 'srs*' + // instructions like 'srsda #31!') and not parse ! as an infix operator. + if (MAI.getCommentString() == "@") + return 0; + Kind = MCBinaryExpr::OrNot; + return 5; case AsmToken::Caret: Kind = MCBinaryExpr::Xor; return 5; @@ -1639,7 +1644,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, MCBinaryExpr::Opcode &Kind) { bool ShouldUseLogicalShr = MAI.shouldUseLogicalShr(); return IsDarwin ? getDarwinBinOpPrecedence(K, Kind, ShouldUseLogicalShr) - : getGNUBinOpPrecedence(K, Kind, ShouldUseLogicalShr); + : getGNUBinOpPrecedence(MAI, K, Kind, ShouldUseLogicalShr); } /// Parse all binary operators with precedence >= 'Precedence'. 
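The `!` infix operator accepted above folds as bitwise OR with the complement of the right operand, matching the MCBinaryExpr::OrNot evaluation added in the MCExpr.cpp hunks earlier in this patch (Result = LHS | ~RHS). A minimal self-contained sketch of the folding rule, with an illustrative helper name and driver that are not LLVM API:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the MCBinaryExpr::OrNot constant folding: LHS | ~RHS.
    static int64_t evalOrNot(int64_t LHS, int64_t RHS) { return LHS | ~RHS; }

    int main() {
      // In GNU as syntax, ".set x, 0xF0 ! 0x0F" folds to 0xF0 | ~0x0F,
      // i.e. every bit set except the low four.
      std::printf("%llx\n", static_cast<unsigned long long>(evalOrNot(0xF0, 0x0F)));
      return 0;
    }

Note the carve-out in the parser change above: when the comment string is "@" (ARM-style assembly), `!` keeps its writeback meaning (as in "srsda #31!") and is not parsed as an infix operator.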
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index e5ab13bc719d4..41779d023a5df 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -450,8 +450,14 @@ bool ELFAsmParser::parseLinkedToSym(MCSymbolELF *&LinkedToSym) { Lex(); StringRef Name; SMLoc StartLoc = L.getLoc(); - if (getParser().parseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) { + if (getParser().getTok().getString() == "0") { + getParser().Lex(); + LinkedToSym = nullptr; + return false; + } return TokError("invalid linked-to symbol"); + } LinkedToSym = dyn_cast_or_null(getContext().lookupSymbol(Name)); if (!LinkedToSym || !LinkedToSym->isInSection()) return Error(StartLoc, "linked-to symbol is not in a section: " + Name); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index fb3bb11d1f430..fb7aaae295dfa 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1720,8 +1720,6 @@ static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K, return 4; // High Intermediate Precedence: |, &, ^ - // - // FIXME: gas seems to support '!' as an infix operator? case AsmToken::Pipe: Kind = MCBinaryExpr::Or; return 5; diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp index 77c259c27a04e..7a15556182658 100644 --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -172,9 +172,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, } if (Flags & ELF::SHF_LINK_ORDER) { - assert(LinkedToSym); OS << ","; - printName(OS, LinkedToSym->getName()); + if (LinkedToSym) + printName(OS, LinkedToSym->getName()); + else + OS << '0'; } if (isUnique()) diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 6d3a933c96a37..df08c343f69f7 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -202,6 +202,9 @@ void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) { emitFill(*MCConstantExpr::create(NumBytes, getContext()), FillValue); } +void llvm::MCStreamer::emitNops(int64_t NumBytes, int64_t ControlledNopLen, + llvm::SMLoc) {} + /// The implementation in this class just redirects to emitFill. 
void MCStreamer::emitZeros(uint64_t NumBytes) { emitFill(NumBytes, 0); } diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 4796ef531054b..8e7bf1eb01697 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -375,6 +375,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, COFFSymbol *Local = nullptr; if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) { Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; + Sym->Section = nullptr; COFFSymbol *WeakDefault = getLinkedSymbol(MCSym); if (!WeakDefault) { diff --git a/llvm/lib/Object/CMakeLists.txt b/llvm/lib/Object/CMakeLists.txt index 61888cbe46f73..9f912ccdff1f1 100644 --- a/llvm/lib/Object/CMakeLists.txt +++ b/llvm/lib/Object/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_component_library(LLVMObject SymbolSize.cpp TapiFile.cpp TapiUniversal.cpp + MachOUniversalWriter.cpp WasmObjectFile.cpp WindowsMachineFlag.cpp WindowsResource.cpp diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp new file mode 100644 index 0000000000000..169d64430284e --- /dev/null +++ b/llvm/lib/Object/MachOUniversalWriter.cpp @@ -0,0 +1,220 @@ +//===- MachOUniversalWriter.cpp - MachO universal binary writer---*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the Slice class and writeUniversalBinary function for writing a MachO +// universal binary file. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/MachOUniversalWriter.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/FileOutputBuffer.h" + +using namespace llvm; +using namespace object; + +// For compatibility with cctools lipo, a file's alignment is calculated as the +// minimum alignment of all segments. For object files, the file's alignment is +// the maximum alignment of its sections. +static uint32_t calculateFileAlignment(const MachOObjectFile &O) { + uint32_t P2CurrentAlignment; + uint32_t P2MinAlignment = MachOUniversalBinary::MaxSectionAlignment; + const bool Is64Bit = O.is64Bit(); + + for (const auto &LC : O.load_commands()) { + if (LC.C.cmd != (Is64Bit ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT)) + continue; + if (O.getHeader().filetype == MachO::MH_OBJECT) { + unsigned NumberOfSections = + (Is64Bit ? O.getSegment64LoadCommand(LC).nsects + : O.getSegmentLoadCommand(LC).nsects); + P2CurrentAlignment = NumberOfSections ? 2 : P2MinAlignment; + for (unsigned SI = 0; SI < NumberOfSections; ++SI) { + P2CurrentAlignment = std::max(P2CurrentAlignment, + (Is64Bit ? O.getSection64(LC, SI).align + : O.getSection(LC, SI).align)); + } + } else { + P2CurrentAlignment = + countTrailingZeros(Is64Bit ?
O.getSegment64LoadCommand(LC).vmaddr + : O.getSegmentLoadCommand(LC).vmaddr); + } + P2MinAlignment = std::min(P2MinAlignment, P2CurrentAlignment); + } + // return a value >= 4 byte aligned, and less than MachO MaxSectionAlignment + return std::max( + static_cast(2), + std::min(P2MinAlignment, static_cast( + MachOUniversalBinary::MaxSectionAlignment))); +} + +static uint32_t calculateAlignment(const MachOObjectFile &ObjectFile) { + switch (ObjectFile.getHeader().cputype) { + case MachO::CPU_TYPE_I386: + case MachO::CPU_TYPE_X86_64: + case MachO::CPU_TYPE_POWERPC: + case MachO::CPU_TYPE_POWERPC64: + return 12; // log2 value of page size(4k) for x86 and PPC + case MachO::CPU_TYPE_ARM: + case MachO::CPU_TYPE_ARM64: + case MachO::CPU_TYPE_ARM64_32: + return 14; // log2 value of page size(16k) for Darwin ARM + default: + return calculateFileAlignment(ObjectFile); + } +} + +Slice::Slice(const MachOObjectFile &O, uint32_t Align) + : B(&O), CPUType(O.getHeader().cputype), + CPUSubType(O.getHeader().cpusubtype), + ArchName(std::string(O.getArchTriple().getArchName())), + P2Alignment(Align) {} + +Slice::Slice(const MachOObjectFile &O) : Slice(O, calculateAlignment(O)) {} + +Expected Slice::create(const Archive *A) { + Error Err = Error::success(); + std::unique_ptr FO = nullptr; + for (const Archive::Child &Child : A->children(Err)) { + Expected> ChildOrErr = Child.getAsBinary(); + if (!ChildOrErr) + return createFileError(A->getFileName(), ChildOrErr.takeError()); + Binary *Bin = ChildOrErr.get().get(); + if (Bin->isMachOUniversalBinary()) + return createStringError(std::errc::invalid_argument, + ("archive member " + Bin->getFileName() + + " is a fat file (not allowed in an archive)") + .str() + .c_str()); + if (!Bin->isMachO()) + return createStringError( + std::errc::invalid_argument, + ("archive member " + Bin->getFileName() + + " is not a MachO file (not allowed in an archive)") + .str() + .c_str()); + MachOObjectFile *O = cast(Bin); + if (FO && std::tie(FO->getHeader().cputype, FO->getHeader().cpusubtype) != + std::tie(O->getHeader().cputype, O->getHeader().cpusubtype)) { + return createStringError( + std::errc::invalid_argument, + ("archive member " + O->getFileName() + " cputype (" + + Twine(O->getHeader().cputype) + ") and cpusubtype(" + + Twine(O->getHeader().cpusubtype) + + ") does not match previous archive members cputype (" + + Twine(FO->getHeader().cputype) + ") and cpusubtype(" + + Twine(FO->getHeader().cpusubtype) + ") (all members must match) " + + FO->getFileName()) + .str() + .c_str()); + } + if (!FO) { + ChildOrErr.get().release(); + FO.reset(O); + } + } + if (Err) + return createFileError(A->getFileName(), std::move(Err)); + if (!FO) + return createStringError( + std::errc::invalid_argument, + ("empty archive with no architecture specification: " + + A->getFileName() + " (can't determine architecture for it)") + .str() + .c_str()); + + Slice ArchiveSlice = Slice(*(FO.get()), FO->is64Bit() ? 
+  ArchiveSlice.B = A;
+  return ArchiveSlice;
+}
+
+static Expected<SmallVector<MachO::fat_arch, 2>>
+buildFatArchList(ArrayRef<Slice> Slices) {
+  SmallVector<MachO::fat_arch, 2> FatArchList;
+  uint64_t Offset =
+      sizeof(MachO::fat_header) + Slices.size() * sizeof(MachO::fat_arch);
+
+  for (const auto &S : Slices) {
+    Offset = alignTo(Offset, 1ull << S.getP2Alignment());
+    if (Offset > UINT32_MAX)
+      return createStringError(
+          std::errc::invalid_argument,
+          ("fat file too large to be created because the offset "
+           "field in struct fat_arch is only 32-bits and the offset " +
+           Twine(Offset) + " for " + S.getBinary()->getFileName() +
+           " for architecture " + S.getArchString() + " exceeds that.")
+              .str()
+              .c_str());
+
+    MachO::fat_arch FatArch;
+    FatArch.cputype = S.getCPUType();
+    FatArch.cpusubtype = S.getCPUSubType();
+    FatArch.offset = Offset;
+    FatArch.size = S.getBinary()->getMemoryBufferRef().getBufferSize();
+    FatArch.align = S.getP2Alignment();
+    Offset += FatArch.size;
+    FatArchList.push_back(FatArch);
+  }
+  return FatArchList;
+}
+
+Error object::writeUniversalBinary(ArrayRef<Slice> Slices,
+                                   StringRef OutputFileName) {
+  MachO::fat_header FatHeader;
+  FatHeader.magic = MachO::FAT_MAGIC;
+  FatHeader.nfat_arch = Slices.size();
+
+  Expected<SmallVector<MachO::fat_arch, 2>> FatArchListOrErr =
+      buildFatArchList(Slices);
+  if (!FatArchListOrErr)
+    return FatArchListOrErr.takeError();
+  SmallVector<MachO::fat_arch, 2> FatArchList = *FatArchListOrErr;
+
+  const bool IsExecutable = any_of(Slices, [](Slice S) {
+    return sys::fs::can_execute(S.getBinary()->getFileName());
+  });
+  const uint64_t OutputFileSize =
+      static_cast<uint64_t>(FatArchList.back().offset) +
+      FatArchList.back().size;
+  Expected<std::unique_ptr<FileOutputBuffer>> OutFileOrError =
+      FileOutputBuffer::create(OutputFileName, OutputFileSize,
+                               IsExecutable ? FileOutputBuffer::F_executable
+                                            : 0);
+  if (!OutFileOrError)
+    return createFileError(OutputFileName, OutFileOrError.takeError());
+  std::unique_ptr<FileOutputBuffer> OutFile = std::move(OutFileOrError.get());
+  std::memset(OutFile->getBufferStart(), 0, OutputFileSize);
+
+  if (sys::IsLittleEndianHost)
+    MachO::swapStruct(FatHeader);
+  std::memcpy(OutFile->getBufferStart(), &FatHeader, sizeof(MachO::fat_header));
+
+  for (size_t Index = 0, Size = Slices.size(); Index < Size; ++Index) {
+    MemoryBufferRef BufferRef = Slices[Index].getBinary()->getMemoryBufferRef();
+    std::copy(BufferRef.getBufferStart(), BufferRef.getBufferEnd(),
+              OutFile->getBufferStart() + FatArchList[Index].offset);
+  }
+
+  // The fat_arch structs are written after the slices in order to reduce the
+  // number of byte-order swaps in the little-endian case.
+  if (sys::IsLittleEndianHost)
+    for (MachO::fat_arch &FA : FatArchList)
+      MachO::swapStruct(FA);
+  std::memcpy(OutFile->getBufferStart() + sizeof(MachO::fat_header),
+              FatArchList.begin(),
+              sizeof(MachO::fat_arch) * FatArchList.size());
+
+  if (Error E = OutFile->commit())
+    return createFileError(OutputFileName, std::move(E));
+
+  return Error::success();
+}
diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
index f61191022fb9b..deff6a68363b2 100644
--- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/ObjectYAML/DWARFYAML.h"
 #include "llvm/Support/Errc.h"
@@ -125,28 +126,49 @@ Error DWARFYAML::emitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
 }
 Error DWARFYAML::emitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
-  for (auto Range : DI.ARanges) {
-    auto
HeaderStart = OS.tell(); - writeInitialLength(Range.Format, Range.Length, OS, DI.IsLittleEndian); + assert(DI.DebugAranges && "unexpected emitDebugAranges() call"); + for (auto Range : *DI.DebugAranges) { + uint8_t AddrSize; + if (Range.AddrSize) + AddrSize = *Range.AddrSize; + else + AddrSize = DI.Is64BitAddrSize ? 8 : 4; + + uint64_t Length = 4; // sizeof(version) 2 + sizeof(address_size) 1 + + // sizeof(segment_selector_size) 1 + Length += + Range.Format == dwarf::DWARF64 ? 8 : 4; // sizeof(debug_info_offset) + + const uint64_t HeaderLength = + Length + (Range.Format == dwarf::DWARF64 + ? 12 + : 4); // sizeof(unit_header) = 12 (DWARF64) or 4 (DWARF32) + const uint64_t PaddedHeaderLength = alignTo(HeaderLength, AddrSize * 2); + + if (Range.Length) { + Length = *Range.Length; + } else { + Length += PaddedHeaderLength - HeaderLength; + Length += AddrSize * 2 * (Range.Descriptors.size() + 1); + } + + writeInitialLength(Range.Format, Length, OS, DI.IsLittleEndian); writeInteger((uint16_t)Range.Version, OS, DI.IsLittleEndian); writeDWARFOffset(Range.CuOffset, Range.Format, OS, DI.IsLittleEndian); - writeInteger((uint8_t)Range.AddrSize, OS, DI.IsLittleEndian); + writeInteger((uint8_t)AddrSize, OS, DI.IsLittleEndian); writeInteger((uint8_t)Range.SegSize, OS, DI.IsLittleEndian); - - auto HeaderSize = OS.tell() - HeaderStart; - auto FirstDescriptor = alignTo(HeaderSize, Range.AddrSize * 2); - ZeroFillBytes(OS, FirstDescriptor - HeaderSize); + ZeroFillBytes(OS, PaddedHeaderLength - HeaderLength); for (auto Descriptor : Range.Descriptors) { - if (Error Err = writeVariableSizedInteger( - Descriptor.Address, Range.AddrSize, OS, DI.IsLittleEndian)) + if (Error Err = writeVariableSizedInteger(Descriptor.Address, AddrSize, + OS, DI.IsLittleEndian)) return createStringError(errc::not_supported, "unable to write debug_aranges address: %s", toString(std::move(Err)).c_str()); - cantFail(writeVariableSizedInteger(Descriptor.Length, Range.AddrSize, OS, + cantFail(writeVariableSizedInteger(Descriptor.Length, AddrSize, OS, DI.IsLittleEndian)); } - ZeroFillBytes(OS, Range.AddrSize * 2); + ZeroFillBytes(OS, AddrSize * 2); } return Error::success(); @@ -189,9 +211,8 @@ Error DWARFYAML::emitDebugRanges(raw_ostream &OS, const DWARFYAML::Data &DI) { return Error::success(); } -Error DWARFYAML::emitPubSection(raw_ostream &OS, - const DWARFYAML::PubSection &Sect, - bool IsLittleEndian, bool IsGNUPubSec) { +static Error emitPubSection(raw_ostream &OS, const DWARFYAML::PubSection &Sect, + bool IsLittleEndian, bool IsGNUPubSec = false) { writeInitialLength(Sect.Length, OS, IsLittleEndian); writeInteger((uint16_t)Sect.Version, OS, IsLittleEndian); writeInteger((uint32_t)Sect.UnitOffset, OS, IsLittleEndian); @@ -207,6 +228,28 @@ Error DWARFYAML::emitPubSection(raw_ostream &OS, return Error::success(); } +Error DWARFYAML::emitDebugPubnames(raw_ostream &OS, const Data &DI) { + assert(DI.PubNames && "unexpected emitDebugPubnames() call"); + return emitPubSection(OS, *DI.PubNames, DI.IsLittleEndian); +} + +Error DWARFYAML::emitDebugPubtypes(raw_ostream &OS, const Data &DI) { + assert(DI.PubTypes && "unexpected emitDebugPubtypes() call"); + return emitPubSection(OS, *DI.PubTypes, DI.IsLittleEndian); +} + +Error DWARFYAML::emitDebugGNUPubnames(raw_ostream &OS, const Data &DI) { + assert(DI.GNUPubNames && "unexpected emitDebugGNUPubnames() call"); + return emitPubSection(OS, *DI.GNUPubNames, DI.IsLittleEndian, + /*IsGNUStyle=*/true); +} + +Error DWARFYAML::emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI) { + 
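+  // Like the three emitters above, this is a thin wrapper over the static
+  // emitPubSection() helper; only the GNU variants set the style flag. A
+  // hypothetical call site (names from this patch, purely illustrative):
+  //   if (DI.GNUPubTypes)
+  //     if (Error E = DWARFYAML::emitDebugGNUPubtypes(OS, DI))
+  //       return E;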
assert(DI.GNUPubTypes && "unexpected emitDebugGNUPubtypes() call"); + return emitPubSection(OS, *DI.GNUPubTypes, DI.IsLittleEndian, + /*IsGNUStyle=*/true); +} + static Expected writeDIE(ArrayRef AbbrevDecls, const DWARFYAML::Unit &Unit, const DWARFYAML::Entry &Entry, @@ -543,9 +586,9 @@ Error DWARFYAML::emitDebugStrOffsets(raw_ostream &OS, const Data &DI) { return Error::success(); } -static Error checkListEntryOperands(StringRef EncodingString, - ArrayRef Values, - uint64_t ExpectedOperands) { +static Error checkOperandCount(StringRef EncodingString, + ArrayRef Values, + uint64_t ExpectedOperands) { if (Values.size() != ExpectedOperands) return createStringError( errc::invalid_argument, @@ -568,6 +611,39 @@ static Error writeListEntryAddress(StringRef EncodingName, raw_ostream &OS, return Error::success(); } +static Expected +writeDWARFExpression(raw_ostream &OS, + const DWARFYAML::DWARFOperation &Operation, + uint8_t AddrSize, bool IsLittleEndian) { + auto CheckOperands = [&](uint64_t ExpectedOperands) -> Error { + return checkOperandCount(dwarf::OperationEncodingString(Operation.Operator), + Operation.Values, ExpectedOperands); + }; + + uint64_t ExpressionBegin = OS.tell(); + writeInteger((uint8_t)Operation.Operator, OS, IsLittleEndian); + switch (Operation.Operator) { + case dwarf::DW_OP_consts: + if (Error Err = CheckOperands(1)) + return std::move(Err); + encodeSLEB128(Operation.Values[0], OS); + break; + case dwarf::DW_OP_stack_value: + if (Error Err = CheckOperands(0)) + return std::move(Err); + break; + default: + StringRef EncodingStr = dwarf::OperationEncodingString(Operation.Operator); + return createStringError(errc::not_supported, + "DWARF expression: " + + (EncodingStr.empty() + ? "0x" + utohexstr(Operation.Operator) + : EncodingStr) + + " is not supported"); + } + return OS.tell() - ExpressionBegin; +} + static Expected writeListEntry(raw_ostream &OS, const DWARFYAML::RnglistEntry &Entry, uint8_t AddrSize, @@ -578,7 +654,7 @@ static Expected writeListEntry(raw_ostream &OS, StringRef EncodingName = dwarf::RangeListEncodingString(Entry.Operator); auto CheckOperands = [&](uint64_t ExpectedOperands) -> Error { - return checkListEntryOperands(EncodingName, Entry.Values, ExpectedOperands); + return checkOperandCount(EncodingName, Entry.Values, ExpectedOperands); }; auto WriteAddress = [&](uint64_t Addr) -> Error { @@ -629,6 +705,103 @@ static Expected writeListEntry(raw_ostream &OS, return OS.tell() - BeginOffset; } +static Expected writeListEntry(raw_ostream &OS, + const DWARFYAML::LoclistEntry &Entry, + uint8_t AddrSize, + bool IsLittleEndian) { + uint64_t BeginOffset = OS.tell(); + writeInteger((uint8_t)Entry.Operator, OS, IsLittleEndian); + + StringRef EncodingName = dwarf::LocListEncodingString(Entry.Operator); + + auto CheckOperands = [&](uint64_t ExpectedOperands) -> Error { + return checkOperandCount(EncodingName, Entry.Values, ExpectedOperands); + }; + + auto WriteAddress = [&](uint64_t Addr) -> Error { + return writeListEntryAddress(EncodingName, OS, Addr, AddrSize, + IsLittleEndian); + }; + + auto WriteDWARFOperations = [&]() -> Error { + std::string OpBuffer; + raw_string_ostream OpBufferOS(OpBuffer); + uint64_t DescriptionsLength = 0; + + for (const DWARFYAML::DWARFOperation &Op : Entry.Descriptions) { + if (Expected OpSize = + writeDWARFExpression(OpBufferOS, Op, AddrSize, IsLittleEndian)) + DescriptionsLength += *OpSize; + else + return OpSize.takeError(); + } + + if (Entry.DescriptionsLength) + DescriptionsLength = *Entry.DescriptionsLength; + else + 
DescriptionsLength = OpBuffer.size();
+
+    encodeULEB128(DescriptionsLength, OS);
+    OS.write(OpBuffer.data(), OpBuffer.size());
+
+    return Error::success();
+  };
+
+  switch (Entry.Operator) {
+  case dwarf::DW_LLE_end_of_list:
+    if (Error Err = CheckOperands(0))
+      return std::move(Err);
+    break;
+  case dwarf::DW_LLE_base_addressx:
+    if (Error Err = CheckOperands(1))
+      return std::move(Err);
+    encodeULEB128(Entry.Values[0], OS);
+    break;
+  case dwarf::DW_LLE_startx_endx:
+  case dwarf::DW_LLE_startx_length:
+  case dwarf::DW_LLE_offset_pair:
+    if (Error Err = CheckOperands(2))
+      return std::move(Err);
+    encodeULEB128(Entry.Values[0], OS);
+    encodeULEB128(Entry.Values[1], OS);
+    if (Error Err = WriteDWARFOperations())
+      return std::move(Err);
+    break;
+  case dwarf::DW_LLE_default_location:
+    if (Error Err = CheckOperands(0))
+      return std::move(Err);
+    if (Error Err = WriteDWARFOperations())
+      return std::move(Err);
+    break;
+  case dwarf::DW_LLE_base_address:
+    if (Error Err = CheckOperands(1))
+      return std::move(Err);
+    if (Error Err = WriteAddress(Entry.Values[0]))
+      return std::move(Err);
+    break;
+  case dwarf::DW_LLE_start_end:
+    if (Error Err = CheckOperands(2))
+      return std::move(Err);
+    if (Error Err = WriteAddress(Entry.Values[0]))
+      return std::move(Err);
+    cantFail(WriteAddress(Entry.Values[1]));
+    if (Error Err = WriteDWARFOperations())
+      return std::move(Err);
+    break;
+  case dwarf::DW_LLE_start_length:
+    if (Error Err = CheckOperands(2))
+      return std::move(Err);
+    if (Error Err = WriteAddress(Entry.Values[0]))
+      return std::move(Err);
+    encodeULEB128(Entry.Values[1], OS);
+    if (Error Err = WriteDWARFOperations())
+      return std::move(Err);
+    break;
+  }
+
+  return OS.tell() - BeginOffset;
+}
+
 template <typename EntryType>
 Error writeDWARFLists(raw_ostream &OS,
                       ArrayRef<DWARFYAML::ListTable<EntryType>> Tables,
@@ -658,12 +831,17 @@ Error writeDWARFLists(raw_ostream &OS,
     for (const DWARFYAML::ListEntries<EntryType> &List : Table.Lists) {
       Offsets.push_back(ListBufferOS.tell());
-      for (const EntryType &Entry : List.Entries) {
-        Expected<uint64_t> EntrySize =
-            writeListEntry(ListBufferOS, Entry, AddrSize, IsLittleEndian);
-        if (!EntrySize)
-          return EntrySize.takeError();
-        Length += *EntrySize;
+      if (List.Content) {
+        List.Content->writeAsBinary(ListBufferOS, UINT64_MAX);
+        Length += List.Content->binary_size();
+      } else if (List.Entries) {
+        for (const EntryType &Entry : *List.Entries) {
+          Expected<uint64_t> EntrySize =
+              writeListEntry(ListBufferOS, Entry, AddrSize, IsLittleEndian);
+          if (!EntrySize)
+            return EntrySize.takeError();
+          Length += *EntrySize;
+        }
       }
     }
@@ -701,7 +879,7 @@ Error writeDWARFLists(raw_ostream &OS,
     EmitOffsets(ArrayRef<uint64_t>((const uint64_t *)Table.Offsets->data(),
                                    Table.Offsets->size()),
                 0);
-    else
+    else if (OffsetEntryCount != 0)
       EmitOffsets(Offsets, OffsetsSize);
 
     OS.write(ListBuffer.data(), ListBuffer.size());
@@ -716,14 +894,47 @@ Error DWARFYAML::emitDebugRnglists(raw_ostream &OS, const Data &DI) {
       OS, *DI.DebugRnglists, DI.IsLittleEndian, DI.Is64BitAddrSize);
 }
 
-using EmitFuncType = Error (*)(raw_ostream &, const DWARFYAML::Data &);
+Error DWARFYAML::emitDebugLoclists(raw_ostream &OS, const Data &DI) {
+  assert(DI.DebugLoclists && "unexpected emitDebugLoclists() call");
+  return writeDWARFLists<DWARFYAML::LoclistEntry>(
+      OS, *DI.DebugLoclists, DI.IsLittleEndian, DI.Is64BitAddrSize);
+}
+
+std::function<Error(raw_ostream &, const DWARFYAML::Data &)>
+DWARFYAML::getDWARFEmitterByName(StringRef SecName) {
+  auto EmitFunc =
+      StringSwitch<
+          std::function<Error(raw_ostream &, const DWARFYAML::Data &)>>(SecName)
+          .Case("debug_abbrev", DWARFYAML::emitDebugAbbrev)
+          .Case("debug_addr", DWARFYAML::emitDebugAddr)
+          .Case("debug_aranges", DWARFYAML::emitDebugAranges)
+          .Case("debug_gnu_pubnames", DWARFYAML::emitDebugGNUPubnames)
+          .Case("debug_gnu_pubtypes", DWARFYAML::emitDebugGNUPubtypes)
+          .Case("debug_info", DWARFYAML::emitDebugInfo)
+          .Case("debug_line", DWARFYAML::emitDebugLine)
+          .Case("debug_loclists", DWARFYAML::emitDebugLoclists)
+          .Case("debug_pubnames", DWARFYAML::emitDebugPubnames)
+          .Case("debug_pubtypes", DWARFYAML::emitDebugPubtypes)
+          .Case("debug_ranges", DWARFYAML::emitDebugRanges)
+          .Case("debug_rnglists", DWARFYAML::emitDebugRnglists)
+          .Case("debug_str", DWARFYAML::emitDebugStr)
+          .Case("debug_str_offsets", DWARFYAML::emitDebugStrOffsets)
+          .Default([SecName](raw_ostream &, const DWARFYAML::Data &) {
+            return createStringError(errc::not_supported,
                                     SecName + " is not supported");
+          });
+
+  return EmitFunc;
+}
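+
+// A minimal lookup sketch (section name assumed, purely illustrative);
+// callers strip any container-specific prefix first, as the ELF emitter
+// later in this patch does with Name.substr(1):
+//   auto EmitFunc = DWARFYAML::getDWARFEmitterByName("debug_info");
+//   if (Error Err = EmitFunc(OS, DWARF))
+//     return std::move(Err);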
.Case("debug_gnu_pubnames", DWARFYAML::emitDebugGNUPubnames) + .Case("debug_gnu_pubtypes", DWARFYAML::emitDebugGNUPubtypes) + .Case("debug_info", DWARFYAML::emitDebugInfo) + .Case("debug_line", DWARFYAML::emitDebugLine) + .Case("debug_loclists", DWARFYAML::emitDebugLoclists) + .Case("debug_pubnames", DWARFYAML::emitDebugPubnames) + .Case("debug_pubtypes", DWARFYAML::emitDebugPubtypes) + .Case("debug_ranges", DWARFYAML::emitDebugRanges) + .Case("debug_rnglists", DWARFYAML::emitDebugRnglists) + .Case("debug_str", DWARFYAML::emitDebugStr) + .Case("debug_str_offsets", DWARFYAML::emitDebugStrOffsets) + .Default([&](raw_ostream &, const DWARFYAML::Data &) { + return createStringError(errc::not_supported, + SecName + " is not supported"); + }); + + return EmitFunc; +} static Error -emitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc, - StringRef Sec, +emitDebugSectionImpl(const DWARFYAML::Data &DI, StringRef Sec, StringMap> &OutputBuffers) { std::string Data; raw_string_ostream DebugInfoStream(Data); + + auto EmitFunc = DWARFYAML::getDWARFEmitterByName(Sec); + if (Error Err = EmitFunc(DebugInfoStream, DI)) return Err; DebugInfoStream.flush(); @@ -750,23 +961,12 @@ DWARFYAML::emitDebugSections(StringRef YAMLString, bool IsLittleEndian) { return createStringError(YIn.error(), GeneratedDiag.getMessage()); StringMap> DebugSections; - Error Err = emitDebugSectionImpl(DI, &DWARFYAML::emitDebugInfo, "debug_info", - DebugSections); - Err = joinErrors(std::move(Err), - emitDebugSectionImpl(DI, &DWARFYAML::emitDebugLine, - "debug_line", DebugSections)); - Err = joinErrors(std::move(Err), - emitDebugSectionImpl(DI, &DWARFYAML::emitDebugStr, - "debug_str", DebugSections)); - Err = joinErrors(std::move(Err), - emitDebugSectionImpl(DI, &DWARFYAML::emitDebugAbbrev, - "debug_abbrev", DebugSections)); - Err = joinErrors(std::move(Err), - emitDebugSectionImpl(DI, &DWARFYAML::emitDebugAranges, - "debug_aranges", DebugSections)); - Err = joinErrors(std::move(Err), - emitDebugSectionImpl(DI, &DWARFYAML::emitDebugRanges, - "debug_ranges", DebugSections)); + Error Err = Error::success(); + cantFail(std::move(Err)); + + for (StringRef SecName : DI.getNonEmptySectionNames()) + Err = joinErrors(std::move(Err), + emitDebugSectionImpl(DI, SecName, DebugSections)); if (Err) return std::move(Err); diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp index 186ec8ac4bdc3..a0caad10a36c1 100644 --- a/llvm/lib/ObjectYAML/DWARFYAML.cpp +++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp @@ -17,16 +17,16 @@ namespace llvm { bool DWARFYAML::Data::isEmpty() const { - return DebugStrings.empty() && AbbrevDecls.empty() && ARanges.empty() && + return DebugStrings.empty() && AbbrevDecls.empty() && DebugAranges && DebugRanges.empty() && !PubNames && !PubTypes && !GNUPubNames && !GNUPubTypes && CompileUnits.empty() && DebugLines.empty(); } -SetVector DWARFYAML::Data::getUsedSectionNames() const { +SetVector DWARFYAML::Data::getNonEmptySectionNames() const { SetVector SecNames; if (!DebugStrings.empty()) SecNames.insert("debug_str"); - if (!ARanges.empty()) + if (DebugAranges) SecNames.insert("debug_aranges"); if (!DebugRanges.empty()) SecNames.insert("debug_ranges"); @@ -50,6 +50,8 @@ SetVector DWARFYAML::Data::getUsedSectionNames() const { SecNames.insert("debug_str_offsets"); if (DebugRnglists) SecNames.insert("debug_rnglists"); + if (DebugLoclists) + SecNames.insert("debug_loclists"); return SecNames; } @@ -61,8 +63,7 @@ void MappingTraits::mapping(IO &IO, DWARFYAML::Data &DWARF) { 
IO.setContext(&DWARFCtx); IO.mapOptional("debug_str", DWARF.DebugStrings); IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls); - if (!DWARF.ARanges.empty() || !IO.outputting()) - IO.mapOptional("debug_aranges", DWARF.ARanges); + IO.mapOptional("debug_aranges", DWARF.DebugAranges); if (!DWARF.DebugRanges.empty() || !IO.outputting()) IO.mapOptional("debug_ranges", DWARF.DebugRanges); IO.mapOptional("debug_pubnames", DWARF.PubNames); @@ -75,6 +76,7 @@ void MappingTraits::mapping(IO &IO, DWARFYAML::Data &DWARF) { IO.mapOptional("debug_addr", DWARF.DebugAddr); IO.mapOptional("debug_str_offsets", DWARF.DebugStrOffsets); IO.mapOptional("debug_rnglists", DWARF.DebugRnglists); + IO.mapOptional("debug_loclists", DWARF.DebugLoclists); IO.setContext(OldContext); } @@ -103,12 +105,12 @@ void MappingTraits::mapping( void MappingTraits::mapping(IO &IO, DWARFYAML::ARange &ARange) { IO.mapOptional("Format", ARange.Format, dwarf::DWARF32); - IO.mapRequired("Length", ARange.Length); + IO.mapOptional("Length", ARange.Length); IO.mapRequired("Version", ARange.Version); IO.mapRequired("CuOffset", ARange.CuOffset); - IO.mapRequired("AddrSize", ARange.AddrSize); - IO.mapRequired("SegSize", ARange.SegSize); - IO.mapRequired("Descriptors", ARange.Descriptors); + IO.mapOptional("AddressSize", ARange.AddrSize); + IO.mapOptional("SegmentSelectorSize", ARange.SegSize, 0); + IO.mapOptional("Descriptors", ARange.Descriptors); } void MappingTraits::mapping( @@ -236,16 +238,39 @@ void MappingTraits::mapping( IO.mapOptional("Offsets", StrOffsetsTable.Offsets); } +void MappingTraits::mapping( + IO &IO, DWARFYAML::DWARFOperation &DWARFOperation) { + IO.mapRequired("Operator", DWARFOperation.Operator); + IO.mapOptional("Values", DWARFOperation.Values); +} + void MappingTraits::mapping( IO &IO, DWARFYAML::RnglistEntry &RnglistEntry) { IO.mapRequired("Operator", RnglistEntry.Operator); IO.mapOptional("Values", RnglistEntry.Values); } +void MappingTraits::mapping( + IO &IO, DWARFYAML::LoclistEntry &LoclistEntry) { + IO.mapRequired("Operator", LoclistEntry.Operator); + IO.mapOptional("Values", LoclistEntry.Values); + IO.mapOptional("DescriptionsLength", LoclistEntry.DescriptionsLength); + IO.mapOptional("Descriptions", LoclistEntry.Descriptions); +} + template void MappingTraits>::mapping( IO &IO, DWARFYAML::ListEntries &ListEntries) { IO.mapOptional("Entries", ListEntries.Entries); + IO.mapOptional("Content", ListEntries.Content); +} + +template +StringRef MappingTraits>::validate( + IO &IO, DWARFYAML::ListEntries &ListEntries) { + if (ListEntries.Entries && ListEntries.Content) + return "Entries and Content can't be used together"; + return StringRef(); } template diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index fc80839064ee0..8cb7af91c56b0 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -355,7 +355,7 @@ ELFState::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH) if (Doc.Symbols) ImplicitSections.push_back(".symtab"); if (Doc.DWARF) - for (StringRef DebugSecName : Doc.DWARF->getUsedSectionNames()) { + for (StringRef DebugSecName : Doc.DWARF->getNonEmptySectionNames()) { std::string SecName = ("." 
+ DebugSecName).str(); ImplicitSections.push_back(StringRef(SecName).copy(StringAlloc)); } @@ -551,6 +551,8 @@ static void overrideFields(ELFYAML::Section *From, typename ELFT::Shdr &To) { To.sh_offset = *From->ShOffset; if (From->ShSize) To.sh_size = *From->ShSize; + if (From->ShType) + To.sh_type = *From->ShType; } template @@ -931,7 +933,7 @@ void ELFState::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name, } static bool shouldEmitDWARF(DWARFYAML::Data &DWARF, StringRef Name) { - SetVector DebugSecNames = DWARF.getUsedSectionNames(); + SetVector DebugSecNames = DWARF.getNonEmptySectionNames(); return Name.consume_front(".") && DebugSecNames.count(Name); } @@ -947,41 +949,9 @@ Expected emitDWARF(typename ELFT::Shdr &SHeader, StringRef Name, return 0; uint64_t BeginOffset = CBA.tell(); - Error Err = Error::success(); - cantFail(std::move(Err)); - - if (Name == ".debug_str") - Err = DWARFYAML::emitDebugStr(*OS, DWARF); - else if (Name == ".debug_aranges") - Err = DWARFYAML::emitDebugAranges(*OS, DWARF); - else if (Name == ".debug_ranges") - Err = DWARFYAML::emitDebugRanges(*OS, DWARF); - else if (Name == ".debug_line") - Err = DWARFYAML::emitDebugLine(*OS, DWARF); - else if (Name == ".debug_addr") - Err = DWARFYAML::emitDebugAddr(*OS, DWARF); - else if (Name == ".debug_abbrev") - Err = DWARFYAML::emitDebugAbbrev(*OS, DWARF); - else if (Name == ".debug_info") - Err = DWARFYAML::emitDebugInfo(*OS, DWARF); - else if (Name == ".debug_pubnames") - Err = DWARFYAML::emitPubSection(*OS, *DWARF.PubNames, DWARF.IsLittleEndian); - else if (Name == ".debug_pubtypes") - Err = DWARFYAML::emitPubSection(*OS, *DWARF.PubTypes, DWARF.IsLittleEndian); - else if (Name == ".debug_gnu_pubnames") - Err = DWARFYAML::emitPubSection(*OS, *DWARF.GNUPubNames, - DWARF.IsLittleEndian, /*IsGNUStyle=*/true); - else if (Name == ".debug_gnu_pubtypes") - Err = DWARFYAML::emitPubSection(*OS, *DWARF.GNUPubTypes, - DWARF.IsLittleEndian, /*IsGNUStyle=*/true); - else if (Name == ".debug_str_offsets") - Err = DWARFYAML::emitDebugStrOffsets(*OS, DWARF); - else if (Name == ".debug_rnglists") - Err = DWARFYAML::emitDebugRnglists(*OS, DWARF); - else - llvm_unreachable("unexpected emitDWARF() call"); - if (Err) + auto EmitFunc = DWARFYAML::getDWARFEmitterByName(Name.substr(1)); + if (Error Err = EmitFunc(*OS, DWARF)) return std::move(Err); return CBA.tell() - BeginOffset; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index f85d6a5d30efa..f460a387540d2 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1094,11 +1094,13 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) { // automatically when they are not explicitly defined. 
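 // ShType is the newest of these overrides; a hedged yaml2obj sketch
 // (section name and types invented for illustration):
 //   Sections:
 //     - Name:   .foo
 //       Type:   SHT_PROGBITS
 //       ShType: SHT_DYNAMIC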
assert(!IO.outputting() || (!Section.ShOffset.hasValue() && !Section.ShSize.hasValue() && - !Section.ShName.hasValue() && !Section.ShFlags.hasValue())); + !Section.ShName.hasValue() && !Section.ShFlags.hasValue() && + !Section.ShType.hasValue())); IO.mapOptional("ShName", Section.ShName); IO.mapOptional("ShOffset", Section.ShOffset); IO.mapOptional("ShSize", Section.ShSize); IO.mapOptional("ShFlags", Section.ShFlags); + IO.mapOptional("ShType", Section.ShType); } static void sectionMapping(IO &IO, ELFYAML::DynamicSection &Section) { diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp index 680264484704b..3b1421440cb99 100644 --- a/llvm/lib/ObjectYAML/MachOEmitter.cpp +++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp @@ -29,7 +29,7 @@ namespace { class MachOWriter { public: - MachOWriter(MachOYAML::Object &Obj) : Obj(Obj), is64Bit(true), fileStart(0) { + MachOWriter(MachOYAML::Object &Obj) : Obj(Obj), fileStart(0) { is64Bit = Obj.Header.magic == MachO::MH_MAGIC_64 || Obj.Header.magic == MachO::MH_CIGAM_64; memset(reinterpret_cast(&Header), 0, sizeof(MachO::mach_header_64)); @@ -293,18 +293,17 @@ Error MachOWriter::writeSectionData(raw_ostream &OS) { Err = DWARFYAML::emitDebugStr(OS, Obj.DWARF); else if (0 == strncmp(&Sec.sectname[0], "__debug_abbrev", 16)) Err = DWARFYAML::emitDebugAbbrev(OS, Obj.DWARF); - else if (0 == strncmp(&Sec.sectname[0], "__debug_aranges", 16)) - Err = DWARFYAML::emitDebugAranges(OS, Obj.DWARF); - else if (0 == strncmp(&Sec.sectname[0], "__debug_ranges", 16)) + else if (0 == strncmp(&Sec.sectname[0], "__debug_aranges", 16)) { + if (Obj.DWARF.DebugAranges) + Err = DWARFYAML::emitDebugAranges(OS, Obj.DWARF); + } else if (0 == strncmp(&Sec.sectname[0], "__debug_ranges", 16)) Err = DWARFYAML::emitDebugRanges(OS, Obj.DWARF); else if (0 == strncmp(&Sec.sectname[0], "__debug_pubnames", 16)) { if (Obj.DWARF.PubNames) - Err = DWARFYAML::emitPubSection(OS, *Obj.DWARF.PubNames, - Obj.IsLittleEndian); + Err = DWARFYAML::emitDebugPubnames(OS, Obj.DWARF); } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubtypes", 16)) { if (Obj.DWARF.PubTypes) - Err = DWARFYAML::emitPubSection(OS, *Obj.DWARF.PubTypes, - Obj.IsLittleEndian); + Err = DWARFYAML::emitDebugPubtypes(OS, Obj.DWARF); } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16)) Err = DWARFYAML::emitDebugInfo(OS, Obj.DWARF); else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16)) diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index 371a21c113dcb..35fde7ba4e28d 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -1,7 +1,3 @@ -if (MSVC) - set_source_files_properties(PassBuilder.cpp PROPERTIES COMPILE_FLAGS /bigobj) -endif() - add_llvm_component_library(LLVMPasses PassBuilder.cpp PassPlugin.cpp diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 771675d35fdf2..b9d1e76fcf198 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -106,6 +106,7 @@ #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" +#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" @@ -185,6 +186,7 @@ #include "llvm/Transforms/Utils/LCSSA.h" #include 
"llvm/Transforms/Utils/LibCallsShrinkWrap.h" #include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/Transforms/Utils/NameAnonGlobals.h" @@ -1144,11 +1146,14 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. - OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions(). - forwardSwitchCondToPhi(true). - convertSwitchToLookupTable(true). - needCanonicalLoops(false). - sinkCommonInsts(true))); + // FIXME: study whether hoisting and/or sinking of common instructions should + // be delayed until after SLP vectorizer. + OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); // Optimize parallel scalar instruction chains into SIMD instructions. if (PTO.SLPVectorization) diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index edaca9ebf6090..7490ebfb29d56 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -81,7 +81,7 @@ MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs())) MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC()) MODULE_PASS("rewrite-symbols", RewriteSymbolPass()) -MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass()) +MODULE_PASS("rpo-function-attrs", ReversePostOrderFunctionAttrsPass()) MODULE_PASS("sample-profile", SampleProfileLoaderPass()) MODULE_PASS("scc-oz-module-inliner", buildInlinerPipeline(OptimizationLevel::Oz, ThinLTOPhase::None, DebugLogging)) @@ -89,6 +89,7 @@ MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr)) MODULE_PASS("verify", VerifierPass()) +MODULE_PASS("dfsan", DataFlowSanitizerPass()) MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false)) MODULE_PASS("msan-module", MemorySanitizerPass({})) MODULE_PASS("tsan-module", ThreadSanitizerPass()) @@ -157,7 +158,7 @@ FUNCTION_ALIAS_ANALYSIS("cfl-anders-aa", CFLAndersAA()) FUNCTION_ALIAS_ANALYSIS("cfl-steens-aa", CFLSteensAA()) FUNCTION_ALIAS_ANALYSIS("scev-aa", SCEVAA()) FUNCTION_ALIAS_ANALYSIS("scoped-noalias-aa", ScopedNoAliasAA()) -FUNCTION_ALIAS_ANALYSIS("type-based-aa", TypeBasedAA()) +FUNCTION_ALIAS_ANALYSIS("tbaa", TypeBasedAA()) #undef FUNCTION_ALIAS_ANALYSIS #undef FUNCTION_ANALYSIS @@ -224,6 +225,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass()) FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-fusion", LoopFusePass()) FUNCTION_PASS("loop-distribute", LoopDistributePass()) +FUNCTION_PASS("loop-versioning", LoopVersioningPass()) FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt()) FUNCTION_PASS("print", PrintFunctionPass(dbgs())) FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 22a7103715d56..d4a6784985a23 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ 
b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; @@ -35,6 +36,13 @@ static cl::opt cl::desc("Enable skipping optional passes optnone functions " "under new pass manager")); +// FIXME: Change `-debug-pass-manager` from boolean to enum type. Similar to +// `-debug-pass` in legacy PM. +static cl::opt + DebugPMVerbose("debug-pass-manager-verbose", cl::Hidden, cl::init(false), + cl::desc("Print all pass management debugging information. " + "`-debug-pass-manager` must also be specified")); + namespace { /// Extracting Module out of \p IR unit. Also fills a textual description @@ -78,14 +86,25 @@ Optional> unwrapModule(Any IR) { llvm_unreachable("Unknown IR unit"); } -void printIR(const Function *F, StringRef Banner, - StringRef Extra = StringRef()) { +void printIR(const Function *F, StringRef Banner, StringRef Extra = StringRef(), + bool Brief = false) { + if (Brief) { + dbgs() << F->getName() << '\n'; + return; + } + if (!llvm::isFunctionInPrintList(F->getName())) return; dbgs() << Banner << Extra << "\n" << static_cast(*F); } -void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) { +void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef(), + bool Brief = false) { + if (Brief) { + dbgs() << M->getName() << '\n'; + return; + } + if (llvm::isFunctionInPrintList("*") || llvm::forcePrintModuleIR()) { dbgs() << Banner << Extra << "\n"; M->print(dbgs(), nullptr, false); @@ -97,7 +116,12 @@ void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) { } void printIR(const LazyCallGraph::SCC *C, StringRef Banner, - StringRef Extra = StringRef()) { + StringRef Extra = StringRef(), bool Brief = false) { + if (Brief) { + dbgs() << *C << '\n'; + return; + } + bool BannerPrinted = false; for (const LazyCallGraph::Node &N : *C) { const Function &F = N.getFunction(); @@ -110,7 +134,13 @@ void printIR(const LazyCallGraph::SCC *C, StringRef Banner, } } } -void printIR(const Loop *L, StringRef Banner) { + +void printIR(const Loop *L, StringRef Banner, bool Brief = false) { + if (Brief) { + dbgs() << *L; + return; + } + const Function *F = L->getHeader()->getParent(); if (!llvm::isFunctionInPrintList(F->getName())) return; @@ -119,7 +149,8 @@ void printIR(const Loop *L, StringRef Banner) { /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into /// llvm::Any and does actual print job. 
-void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false) { +void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false, + bool Brief = false) { if (ForceModule) { if (auto UnwrappedModule = unwrapModule(IR)) printIR(UnwrappedModule->first, Banner, UnwrappedModule->second); @@ -129,14 +160,14 @@ void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false) { if (any_isa(IR)) { const Module *M = any_cast(IR); assert(M && "module should be valid for printing"); - printIR(M, Banner); + printIR(M, Banner, "", Brief); return; } if (any_isa(IR)) { const Function *F = any_cast(IR); assert(F && "function should be valid for printing"); - printIR(F, Banner); + printIR(F, Banner, "", Brief); return; } @@ -144,14 +175,14 @@ void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false) { const LazyCallGraph::SCC *C = any_cast(IR); assert(C && "scc should be valid for printing"); std::string Extra = std::string(formatv(" (scc: {0})", C->getName())); - printIR(C, Banner, Extra); + printIR(C, Banner, Extra, Brief); return; } if (any_isa(IR)) { const Loop *L = any_cast(IR); assert(L && "Loop should be valid for printing"); - printIR(L, Banner); + printIR(L, Banner, Brief); return; } llvm_unreachable("Unknown wrapped IR type"); @@ -180,9 +211,9 @@ PrintIRInstrumentation::popModuleDesc(StringRef PassID) { return ModuleDesc; } -bool PrintIRInstrumentation::printBeforePass(StringRef PassID, Any IR) { +void PrintIRInstrumentation::printBeforePass(StringRef PassID, Any IR) { if (PassID.startswith("PassManager<") || PassID.contains("PassAdaptor<")) - return true; + return; // Saving Module for AfterPassInvalidated operations. // Note: here we rely on a fact that we do not change modules while @@ -192,11 +223,11 @@ bool PrintIRInstrumentation::printBeforePass(StringRef PassID, Any IR) { pushModuleDesc(PassID, IR); if (!llvm::shouldPrintBeforePass(PassID)) - return true; + return; SmallString<20> Banner = formatv("*** IR Dump Before {0} ***", PassID); unwrapAndPrint(IR, Banner, llvm::forcePrintModuleIR()); - return true; + return; } void PrintIRInstrumentation::printAfterPass(StringRef PassID, Any IR) { @@ -240,8 +271,8 @@ void PrintIRInstrumentation::registerCallbacks( // for later use in AfterPassInvalidated. 
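 // (With registerBeforeNonSkippedPassCallback below, the descriptor is only
 // pushed for passes that will actually run, so a skipped pass can no longer
 // leave a stale entry behind.)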
StoreModuleDesc = llvm::forcePrintModuleIR() && llvm::shouldPrintAfterPass(); if (llvm::shouldPrintBeforePass() || StoreModuleDesc) - PIC.registerBeforePassCallback( - [this](StringRef P, Any IR) { return this->printBeforePass(P, IR); }); + PIC.registerBeforeNonSkippedPassCallback( + [this](StringRef P, Any IR) { this->printBeforePass(P, IR); }); if (llvm::shouldPrintAfterPass()) { PIC.registerAfterPassCallback( @@ -274,9 +305,34 @@ bool OptNoneInstrumentation::skip(StringRef PassID, Any IR) { return true; } +void PrintPassInstrumentation::registerCallbacks( + PassInstrumentationCallbacks &PIC) { + if (!DebugLogging) + return; + + std::vector SpecialPasses = {"PassManager"}; + if (!DebugPMVerbose) + SpecialPasses.emplace_back("PassAdaptor"); + + PIC.registerBeforeNonSkippedPassCallback( + [SpecialPasses](StringRef PassID, Any IR) { + if (isSpecialPass(PassID, SpecialPasses)) + return; + + dbgs() << "Running pass: " << PassID << " on "; + unwrapAndPrint(IR, "", false, true); + }); + + PIC.registerBeforeAnalysisCallback([](StringRef PassID, Any IR) { + dbgs() << "Running analysis: " << PassID << " on "; + unwrapAndPrint(IR, "", false, true); + }); +} + void StandardInstrumentations::registerCallbacks( PassInstrumentationCallbacks &PIC) { PrintIR.registerCallbacks(PIC); + PrintPass.registerCallbacks(PIC); TimePasses.registerCallbacks(PIC); OptNone.registerCallbacks(PIC); } diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 9b429bf37d742..a172f319b502c 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -625,11 +625,11 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, } } -void InstrProfValueSiteRecord::scale(uint64_t Weight, +void InstrProfValueSiteRecord::scale(uint64_t N, uint64_t D, function_ref Warn) { for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { bool Overflowed; - I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed); + I->Count = SaturatingMultiply(I->Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } @@ -678,22 +678,23 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight, } void InstrProfRecord::scaleValueProfData( - uint32_t ValueKind, uint64_t Weight, + uint32_t ValueKind, uint64_t N, uint64_t D, function_ref Warn) { for (auto &R : getValueSitesForKind(ValueKind)) - R.scale(Weight, Warn); + R.scale(N, D, Warn); } -void InstrProfRecord::scale(uint64_t Weight, +void InstrProfRecord::scale(uint64_t N, uint64_t D, function_ref Warn) { + assert(D != 0 && "D cannot be 0"); for (auto &Count : this->Counts) { bool Overflowed; - Count = SaturatingMultiply(Count, Weight, &Overflowed); + Count = SaturatingMultiply(Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) - scaleValueProfData(Kind, Weight, Warn); + scaleValueProfData(Kind, N, D, Warn); } // Map indirect call target name hash to name string. @@ -1111,6 +1112,8 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { return true; } +// FIXME: This is to be removed after switching to the new memop value +// profiling. // Parse the value profile options. 
void getMemOPSizeRangeFromOption(StringRef MemOPSizeRange, int64_t &RangeStart, int64_t &RangeLast) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 88445f186e835..d07668322354e 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -241,7 +241,7 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, // We've never seen a function with this name and hash, add it. Dest = std::move(I); if (Weight > 1) - Dest.scale(Weight, MapWarn); + Dest.scale(Weight, 1, MapWarn); } else { // We're updating a function we've seen before. Dest.merge(I, Weight, MapWarn); diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp index 5d3a076409427..d2603097c550b 100644 --- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -119,13 +119,22 @@ std::unique_ptr InstrProfSummaryBuilder::getSummary() { } void InstrProfSummaryBuilder::addEntryCount(uint64_t Count) { - addCount(Count); NumFunctions++; + + // Skip invalid count. + if (Count == (uint64_t)-1) + return; + + addCount(Count); if (Count > MaxFunctionCount) MaxFunctionCount = Count; } void InstrProfSummaryBuilder::addInternalCount(uint64_t Count) { + // Skip invalid count. + if (Count == (uint64_t)-1) + return; + addCount(Count); if (Count > MaxInternalBlockCount) MaxInternalBlockCount = Count; diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp index 56a91f7dc7876..751f84475f42c 100644 --- a/llvm/lib/Support/ARMTargetParser.cpp +++ b/llvm/lib/Support/ARMTargetParser.cpp @@ -490,9 +490,10 @@ static unsigned findDoublePrecisionFPU(unsigned InputFPUKind) { return ARM::FK_INVALID; } -bool ARM::appendArchExtFeatures( - StringRef CPU, ARM::ArchKind AK, StringRef ArchExt, - std::vector &Features) { +bool ARM::appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, + StringRef ArchExt, + std::vector &Features, + unsigned &ArgFPUID) { size_t StartingNumFeatures = Features.size(); const bool Negated = stripNegationPrefix(ArchExt); @@ -527,6 +528,7 @@ bool ARM::appendArchExtFeatures( } else { FPUKind = getDefaultFPU(CPU, AK); } + ArgFPUID = FPUKind; return ARM::getFPUFeatures(FPUKind, Features); } return StartingNumFeatures != Features.size(); diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 8c04fa3fc0b54..53ad806d818f5 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -202,8 +202,11 @@ add_llvm_component_library(LLVMSupport set(llvm_system_libs ${system_libs}) if(LLVM_ENABLE_ZLIB) - string(TOUPPER ${CMAKE_BUILD_TYPE} build_type) - get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION_${build_type}) + # CMAKE_BUILD_TYPE is only meaningful to single-configuration generators. 
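+  # Multi-config generators (Visual Studio, Xcode) leave it empty, in which
+  # case LOCATION_${build_type} would expand to the invalid property name
+  # "LOCATION_"; the generic LOCATION lookup below covers that case.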
+ if(CMAKE_BUILD_TYPE) + string(TOUPPER ${CMAKE_BUILD_TYPE} build_type) + get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION_${build_type}) + endif() if(NOT zlib_library) get_property(zlib_library TARGET ZLIB::ZLIB PROPERTY LOCATION) endif() diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 12ef0d511b147..4fba6a9ada2c0 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -1271,36 +1271,6 @@ bool cl::readConfigFile(StringRef CfgFile, StringSaver &Saver, /*MarkEOLs*/ false, /*RelativeNames*/ true); } -/// ParseEnvironmentOptions - An alternative entry point to the -/// CommandLine library, which allows you to read the program's name -/// from the caller (as PROGNAME) and its command-line arguments from -/// an environment variable (whose name is given in ENVVAR). -/// -void cl::ParseEnvironmentOptions(const char *progName, const char *envVar, - const char *Overview) { - // Check args. - assert(progName && "Program name not specified"); - assert(envVar && "Environment variable name missing"); - - // Get the environment variable they want us to parse options out of. - llvm::Optional envValue = sys::Process::GetEnv(StringRef(envVar)); - if (!envValue) - return; - - // Get program's "name", which we wouldn't know without the caller - // telling us. - SmallVector newArgv; - BumpPtrAllocator A; - StringSaver Saver(A); - newArgv.push_back(Saver.save(progName).data()); - - // Parse the value of the environment variable into a "command line" - // and hand it off to ParseCommandLineOptions(). - TokenizeGNUCommandLine(*envValue, Saver, newArgv); - int newArgc = static_cast(newArgv.size()); - ParseCommandLineOptions(newArgc, &newArgv[0], StringRef(Overview)); -} - bool cl::ParseCommandLineOptions(int argc, const char *const *argv, StringRef Overview, raw_ostream *Errs, const char *EnvVar, diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp index d0e79c675bcbd..137eea0a65a64 100644 --- a/llvm/lib/Support/FileCheck.cpp +++ b/llvm/lib/Support/FileCheck.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/FileCheck.h" #include "FileCheckImpl.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CheckedArithmetic.h" @@ -1218,7 +1219,7 @@ Expected Pattern::match(StringRef Buffer, size_t &MatchLen, Format.valueFromStringRepr(MatchedValue, SM); if (!Value) return Value.takeError(); - DefinedNumericVariable->setValue(*Value); + DefinedNumericVariable->setValue(*Value, MatchedValue); } // Like CHECK-NEXT, CHECK-EMPTY's match range is considered to start after @@ -1247,7 +1248,9 @@ unsigned Pattern::computeMatchDistance(StringRef Buffer) const { } void Pattern::printSubstitutions(const SourceMgr &SM, StringRef Buffer, - SMRange MatchRange) const { + SMRange Range, + FileCheckDiag::MatchType MatchTy, + std::vector *Diags) const { // Print what we know about substitutions. if (!Substitutions.empty()) { for (const auto &Substitution : Substitutions) { @@ -1280,29 +1283,86 @@ void Pattern::printSubstitutions(const SourceMgr &SM, StringRef Buffer, OS.write_escaped(*MatchedValue) << "\""; } - if (MatchRange.isValid()) - SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note, OS.str(), - {MatchRange}); + // We report only the start of the match/search range to suggest we are + // reporting the substitutions as set at the start of the match/search. 
+ // Indicating a non-zero-length range might instead seem to imply that the + // substitution matches or was captured from exactly that range. + if (Diags) + Diags->emplace_back(SM, CheckTy, getLoc(), MatchTy, + SMRange(Range.Start, Range.Start), OS.str()); else - SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()), - SourceMgr::DK_Note, OS.str()); + SM.PrintMessage(Range.Start, SourceMgr::DK_Note, OS.str()); } } } +void Pattern::printVariableDefs(const SourceMgr &SM, + FileCheckDiag::MatchType MatchTy, + std::vector *Diags) const { + if (VariableDefs.empty() && NumericVariableDefs.empty()) + return; + // Build list of variable captures. + struct VarCapture { + StringRef Name; + SMRange Range; + }; + SmallVector VarCaptures; + for (const auto &VariableDef : VariableDefs) { + VarCapture VC; + VC.Name = VariableDef.first; + StringRef Value = Context->GlobalVariableTable[VC.Name]; + SMLoc Start = SMLoc::getFromPointer(Value.data()); + SMLoc End = SMLoc::getFromPointer(Value.data() + Value.size()); + VC.Range = SMRange(Start, End); + VarCaptures.push_back(VC); + } + for (const auto &VariableDef : NumericVariableDefs) { + VarCapture VC; + VC.Name = VariableDef.getKey(); + StringRef StrValue = VariableDef.getValue() + .DefinedNumericVariable->getStringValue() + .getValue(); + SMLoc Start = SMLoc::getFromPointer(StrValue.data()); + SMLoc End = SMLoc::getFromPointer(StrValue.data() + StrValue.size()); + VC.Range = SMRange(Start, End); + VarCaptures.push_back(VC); + } + // Sort variable captures by the order in which they matched the input. + // Ranges shouldn't be overlapping, so we can just compare the start. + std::sort(VarCaptures.begin(), VarCaptures.end(), + [](const VarCapture &A, const VarCapture &B) { + assert(A.Range.Start != B.Range.Start && + "unexpected overlapping variable captures"); + return A.Range.Start.getPointer() < B.Range.Start.getPointer(); + }); + // Create notes for the sorted captures. 
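+  // Each capture is reported either as a FileCheckDiag (when Diags is
+  // non-null, e.g. when annotated input dumps are requested) or as an
+  // immediate SourceMgr note, mirroring how match results are reported.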
+ for (const VarCapture &VC : VarCaptures) { + SmallString<256> Msg; + raw_svector_ostream OS(Msg); + OS << "captured var \"" << VC.Name << "\""; + if (Diags) + Diags->emplace_back(SM, CheckTy, getLoc(), MatchTy, VC.Range, OS.str()); + else + SM.PrintMessage(VC.Range.Start, SourceMgr::DK_Note, OS.str(), VC.Range); + } +} + static SMRange ProcessMatchResult(FileCheckDiag::MatchType MatchTy, const SourceMgr &SM, SMLoc Loc, Check::FileCheckType CheckTy, StringRef Buffer, size_t Pos, size_t Len, std::vector *Diags, - bool AdjustPrevDiag = false) { + bool AdjustPrevDiags = false) { SMLoc Start = SMLoc::getFromPointer(Buffer.data() + Pos); SMLoc End = SMLoc::getFromPointer(Buffer.data() + Pos + Len); SMRange Range(Start, End); if (Diags) { - if (AdjustPrevDiag) - Diags->rbegin()->MatchTy = MatchTy; - else + if (AdjustPrevDiags) { + SMLoc CheckLoc = Diags->rbegin()->CheckLoc; + for (auto I = Diags->rbegin(), E = Diags->rend(); + I != E && I->CheckLoc == CheckLoc; ++I) + I->MatchTy = MatchTy; + } else Diags->emplace_back(SM, CheckTy, Loc, MatchTy, Range); } return Range; @@ -1455,8 +1515,8 @@ StringRef FileCheck::CanonicalizeFile(MemoryBuffer &MB, FileCheckDiag::FileCheckDiag(const SourceMgr &SM, const Check::FileCheckType &CheckTy, SMLoc CheckLoc, MatchType MatchTy, - SMRange InputRange) - : CheckTy(CheckTy), CheckLoc(CheckLoc), MatchTy(MatchTy) { + SMRange InputRange, StringRef Note) + : CheckTy(CheckTy), CheckLoc(CheckLoc), MatchTy(MatchTy), Note(Note) { auto Start = SM.getLineAndColumn(InputRange.Start); auto End = SM.getLineAndColumn(InputRange.End); InputStartLine = Start.first; @@ -1519,9 +1579,7 @@ FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) { StringRef Rest = Buffer.drop_front(Prefix.size() + 1); // Check for comment. - if (Req.CommentPrefixes.end() != std::find(Req.CommentPrefixes.begin(), - Req.CommentPrefixes.end(), - Prefix)) { + if (llvm::is_contained(Req.CommentPrefixes, Prefix)) { if (NextChar == ':') return {Check::CheckComment, Rest}; // Ignore a comment prefix if it has a suffix like "-NOT". @@ -1863,10 +1921,15 @@ static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM, // diagnostics. PrintDiag = !Diags; } - SMRange MatchRange = ProcessMatchResult( - ExpectedMatch ? FileCheckDiag::MatchFoundAndExpected - : FileCheckDiag::MatchFoundButExcluded, - SM, Loc, Pat.getCheckTy(), Buffer, MatchPos, MatchLen, Diags); + FileCheckDiag::MatchType MatchTy = ExpectedMatch + ? FileCheckDiag::MatchFoundAndExpected + : FileCheckDiag::MatchFoundButExcluded; + SMRange MatchRange = ProcessMatchResult(MatchTy, SM, Loc, Pat.getCheckTy(), + Buffer, MatchPos, MatchLen, Diags); + if (Diags) { + Pat.printSubstitutions(SM, Buffer, MatchRange, MatchTy, Diags); + Pat.printVariableDefs(SM, MatchTy, Diags); + } if (!PrintDiag) return; @@ -1881,7 +1944,8 @@ static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM, Loc, ExpectedMatch ? SourceMgr::DK_Remark : SourceMgr::DK_Error, Message); SM.PrintMessage(MatchRange.Start, SourceMgr::DK_Note, "found here", {MatchRange}); - Pat.printSubstitutions(SM, Buffer, MatchRange); + Pat.printSubstitutions(SM, Buffer, MatchRange, MatchTy, nullptr); + Pat.printVariableDefs(SM, MatchTy, nullptr); } static void PrintMatch(bool ExpectedMatch, const SourceMgr &SM, @@ -1914,10 +1978,13 @@ static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM, // If the current position is at the end of a line, advance to the start of // the next line. 
Buffer = Buffer.substr(Buffer.find_first_not_of(" \t\n\r")); - SMRange SearchRange = ProcessMatchResult( - ExpectedMatch ? FileCheckDiag::MatchNoneButExpected - : FileCheckDiag::MatchNoneAndExcluded, - SM, Loc, Pat.getCheckTy(), Buffer, 0, Buffer.size(), Diags); + FileCheckDiag::MatchType MatchTy = ExpectedMatch + ? FileCheckDiag::MatchNoneButExpected + : FileCheckDiag::MatchNoneAndExcluded; + SMRange SearchRange = ProcessMatchResult(MatchTy, SM, Loc, Pat.getCheckTy(), + Buffer, 0, Buffer.size(), Diags); + if (Diags) + Pat.printSubstitutions(SM, Buffer, SearchRange, MatchTy, Diags); if (!PrintDiag) { consumeError(std::move(MatchErrors)); return; @@ -1945,7 +2012,7 @@ static void PrintNoMatch(bool ExpectedMatch, const SourceMgr &SM, SM.PrintMessage(SearchRange.Start, SourceMgr::DK_Note, "scanning from here"); // Allow the pattern to print additional information if desired. - Pat.printSubstitutions(SM, Buffer); + Pat.printSubstitutions(SM, Buffer, SearchRange, MatchTy, nullptr); if (ExpectedMatch) Pat.printFuzzyMatch(SM, Buffer, Diags); @@ -2248,8 +2315,12 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, SM.PrintMessage(OldStart, SourceMgr::DK_Note, "match discarded, overlaps earlier DAG match here", {OldRange}); - } else - Diags->rbegin()->MatchTy = FileCheckDiag::MatchFoundButDiscarded; + } else { + SMLoc CheckLoc = Diags->rbegin()->CheckLoc; + for (auto I = Diags->rbegin(), E = Diags->rend(); + I != E && I->CheckLoc == CheckLoc; ++I) + I->MatchTy = FileCheckDiag::MatchFoundButDiscarded; + } } MatchPos = MI->End; } diff --git a/llvm/lib/Support/FileCheckImpl.h b/llvm/lib/Support/FileCheckImpl.h index 6ca67ec2964c2..edb0c2548308d 100644 --- a/llvm/lib/Support/FileCheckImpl.h +++ b/llvm/lib/Support/FileCheckImpl.h @@ -261,6 +261,10 @@ class NumericVariable { /// Value of numeric variable, if defined, or None otherwise. Optional Value; + /// The input buffer's string from which Value was parsed, or None. See + /// comments on getStringValue for a discussion of the None case. + Optional StrValue; + /// Line number where this variable is defined, or None if defined before /// input is parsed. Used to determine whether a variable is defined on the /// same line as a given use. @@ -284,12 +288,28 @@ class NumericVariable { /// \returns this variable's value. Optional getValue() const { return Value; } - /// Sets value of this numeric variable to \p NewValue. - void setValue(ExpressionValue NewValue) { Value = NewValue; } + /// \returns the input buffer's string from which this variable's value was + /// parsed, or None if the value is not yet defined or was not parsed from the + /// input buffer. For example, the value of @LINE is not parsed from the + /// input buffer, and some numeric variables are parsed from the command + /// line instead. + Optional getStringValue() const { return StrValue; } + + /// Sets value of this numeric variable to \p NewValue, and sets the input + /// buffer string from which it was parsed to \p NewStrValue. See comments on + /// getStringValue for a discussion of when the latter can be None. + void setValue(ExpressionValue NewValue, + Optional NewStrValue = None) { + Value = NewValue; + StrValue = NewStrValue; + } /// Clears value of this numeric variable, regardless of whether it is /// currently defined or not. - void clearValue() { Value = None; } + void clearValue() { + Value = None; + StrValue = None; + } /// \returns the line number where this variable is defined, if any, or None /// if defined before input is parsed. 
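+  // A sketch of the intended flow (simplified; both calls appear elsewhere
+  // in this patch): on a successful match, Pattern::match() runs
+  //   DefinedNumericVariable->setValue(*Value, MatchedValue);
+  // so printVariableDefs() can later map getStringValue() back to the exact
+  // input range that produced the "captured var" note.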
@@ -683,13 +703,16 @@ class Pattern { /// Prints the value of successful substitutions or the name of the undefined /// string or numeric variables preventing a successful substitution. void printSubstitutions(const SourceMgr &SM, StringRef Buffer, - SMRange MatchRange = None) const; + SMRange MatchRange, FileCheckDiag::MatchType MatchTy, + std::vector *Diags) const; void printFuzzyMatch(const SourceMgr &SM, StringRef Buffer, std::vector *Diags) const; bool hasVariable() const { return !(Substitutions.empty() && VariableDefs.empty()); } + void printVariableDefs(const SourceMgr &SM, FileCheckDiag::MatchType MatchTy, + std::vector *Diags) const; Check::FileCheckType getCheckTy() const { return CheckTy; } diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 658c1ee74cfec..d3b255ae0f2ec 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1271,6 +1271,15 @@ int computeHostNumPhysicalCores() { } return CPU_COUNT(&Enabled); } +#elif defined(__linux__) && defined(__powerpc__) +int computeHostNumPhysicalCores() { + cpu_set_t Affinity; + if (sched_getaffinity(0, sizeof(Affinity), &Affinity) != 0) + return -1; + return CPU_COUNT(&Affinity); +} +#elif defined(__linux__) && defined(__s390x__) +int computeHostNumPhysicalCores() { return sysconf(_SC_NPROCESSORS_ONLN); } #elif defined(__APPLE__) && defined(__x86_64__) #include #include diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index e8fa71f5f69e8..fa1372623d03b 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -171,8 +171,6 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case AMD: return "amd"; case Apple: return "apple"; - case BGP: return "bgp"; - case BGQ: return "bgq"; case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; @@ -199,7 +197,6 @@ StringRef Triple::getOSTypeName(OSType Kind) { case AMDHSA: return "amdhsa"; case AMDPAL: return "amdpal"; case Ananas: return "ananas"; - case CNK: return "cnk"; case CUDA: return "cuda"; case CloudABI: return "cloudabi"; case Contiki: return "contiki"; @@ -491,8 +488,6 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("apple", Triple::Apple) .Case("pc", Triple::PC) .Case("scei", Triple::SCEI) - .Case("bgp", Triple::BGP) - .Case("bgq", Triple::BGQ) .Case("fsl", Triple::Freescale) .Case("ibm", Triple::IBM) .Case("img", Triple::ImaginationTechnologies) @@ -530,7 +525,6 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) .StartsWith("nacl", Triple::NaCl) - .StartsWith("cnk", Triple::CNK) .StartsWith("aix", Triple::AIX) .StartsWith("cuda", Triple::CUDA) .StartsWith("nvcl", Triple::NVCL) diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index d91b269cc6d33..01903ea10e814 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -33,6 +33,7 @@ #include #include +#include #ifdef __APPLE__ #include @@ -791,6 +792,16 @@ std::error_code setLastAccessAndModificationTime(int FD, TimePoint<> AccessTime, if (::futimes(FD, Times)) return std::error_code(errno, std::generic_category()); return std::error_code(); +#elif defined(__MVS__) + attrib_t Attr; + memset(&Attr, 0, sizeof(Attr)); + Attr.att_atimechg = 1; + Attr.att_atime = sys::toTimeT(AccessTime); + Attr.att_mtimechg = 1; + Attr.att_mtime = sys::toTimeT(ModificationTime); + if (::__fchattr(FD, &Attr, sizeof(Attr)) != 0) + return std::error_code(errno, std::generic_category()); + return 
std::error_code(); #else #warning Missing futimes() and futimens() return make_error_code(errc::function_not_supported); @@ -1078,6 +1089,50 @@ Expected readNativeFileSlice(file_t FD, MutableArrayRef Buf, return NumRead; } +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + struct flock Lock; + memset(&Lock, 0, sizeof(Lock)); + Lock.l_type = F_WRLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLK, &Lock) != -1) + return std::error_code(); + int Error = errno; + if (Error != EACCES && Error != EAGAIN) + return std::error_code(Error, std::generic_category()); + usleep(1000); + } while (std::chrono::steady_clock::now() < End); + return make_error_code(errc::no_lock_available); +} + +std::error_code lockFile(int FD) { + struct flock Lock; + memset(&Lock, 0, sizeof(Lock)); + Lock.l_type = F_WRLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLKW, &Lock) != -1) + return std::error_code(); + int Error = errno; + return std::error_code(Error, std::generic_category()); +} + +std::error_code unlockFile(int FD) { + struct flock Lock; + Lock.l_type = F_UNLCK; + Lock.l_whence = SEEK_SET; + Lock.l_start = 0; + Lock.l_len = 0; + if (::fcntl(FD, F_SETLK, &Lock) != -1) + return std::error_code(); + return std::error_code(errno, std::generic_category()); +} + std::error_code closeFile(file_t &F) { file_t TmpF = F; F = kInvalidFile; diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index f68374d29f023..ce1fccf0b4271 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -382,8 +382,7 @@ static RETSIGTYPE SignalHandler(int Sig) { OneShotPipeSignalFunction.exchange(nullptr)) return OldOneShotPipeFunction(); - if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig) - != std::end(IntSigs)) { + if (llvm::is_contained(IntSigs, Sig)) { if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr)) return OldInterruptFunction(); diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index e352beb77616b..3570d1d6e0563 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -1260,6 +1260,43 @@ Expected readNativeFileSlice(file_t FileHandle, return readNativeFileImpl(FileHandle, Buf, &Overlapped); } +std::error_code tryLockFile(int FD, std::chrono::milliseconds Timeout) { + DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY; + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + auto Start = std::chrono::steady_clock::now(); + auto End = Start + Timeout; + do { + if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV)) + return std::error_code(); + DWORD Error = ::GetLastError(); + if (Error == ERROR_LOCK_VIOLATION) { + ::Sleep(1); + continue; + } + return mapWindowsError(Error); + } while (std::chrono::steady_clock::now() < End); + return mapWindowsError(ERROR_LOCK_VIOLATION); +} + +std::error_code lockFile(int FD) { + DWORD Flags = LOCKFILE_EXCLUSIVE_LOCK; + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + if (::LockFileEx(File, Flags, 0, MAXDWORD, MAXDWORD, &OV)) + return std::error_code(); + DWORD Error = ::GetLastError(); + return mapWindowsError(Error); +} + +std::error_code unlockFile(int FD) { + OVERLAPPED OV = {0}; + file_t File = convertFDToNativeFile(FD); + if (::UnlockFileEx(File, 0, MAXDWORD, MAXDWORD, &OV)) + return 
std::error_code(); + return mapWindowsError(::GetLastError()); +} + std::error_code closeFile(file_t &F) { file_t TmpF = F; F = kInvalidFile; diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index f2d78d7732397..86c48993957a0 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -861,6 +861,21 @@ bool raw_fd_ostream::has_colors() const { return sys::Process::FileDescriptorHasColors(FD); } +Expected raw_fd_ostream::lock() { + std::error_code EC = sys::fs::lockFile(FD); + if (!EC) + return sys::fs::FileLocker(FD); + return errorCodeToError(EC); +} + +Expected +raw_fd_ostream::tryLockFor(std::chrono::milliseconds Timeout) { + std::error_code EC = sys::fs::tryLockFile(FD, Timeout); + if (!EC) + return sys::fs::FileLocker(FD); + return errorCodeToError(EC); +} + void raw_fd_ostream::anchor() {} //===----------------------------------------------------------------------===// diff --git a/llvm/lib/TableGen/Error.cpp b/llvm/lib/TableGen/Error.cpp index 54b063cb4f8d2..1dfba9fb6b5dc 100644 --- a/llvm/lib/TableGen/Error.cpp +++ b/llvm/lib/TableGen/Error.cpp @@ -45,6 +45,13 @@ void PrintNote(ArrayRef NoteLoc, const Twine &Msg) { PrintMessage(NoteLoc, SourceMgr::DK_Note, Msg); } +void PrintFatalNote(ArrayRef NoteLoc, const Twine &Msg) { + PrintNote(NoteLoc, Msg); + // The following call runs the file cleanup handlers. + sys::RunInterruptHandlers(); + std::exit(1); +} + void PrintWarning(ArrayRef WarningLoc, const Twine &Msg) { PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg); } diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 538863ebe95af..5d421cfbc2da8 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -54,7 +54,7 @@ using namespace llvm; -#define DEBUG_TYPE "falkor-hwpf-fix" +#define DEBUG_TYPE "aarch64-falkor-hwpf-fix" STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); STATISTIC(NumCollisionsAvoided, @@ -224,10 +224,10 @@ struct LoadInfo { char FalkorHWPFFix::ID = 0; -INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", +INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index efa3fd5ca9cef..30666009801c5 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -148,6 +148,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -399,12 +400,102 @@ static bool ShouldSignReturnAddress(MachineFunction &MF) { return false; } +// Convenience function to create a DWARF expression for +// Expr + NumBytes + NumVGScaledBytes * AArch64::VG +static void appendVGScaledOffsetExpr(SmallVectorImpl &Expr, + int NumBytes, int NumVGScaledBytes, unsigned VG, + llvm::raw_string_ostream &Comment) { + uint8_t buffer[16]; + + if (NumBytes) { + 
Expr.push_back(dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + } + + if (NumVGScaledBytes) { + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (NumVGScaledBytes < 0 ? " - " : " + ") + << std::abs(NumVGScaledBytes) << " * VG"; + } +} + +// Creates an MCCFIInstruction: +// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } +MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( + const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const { + int64_t NumBytes, NumVGScaledBytes; + OffsetFromSP.getForDwarfOffset(NumBytes, NumVGScaledBytes); + + std::string CommentBuffer = "sp"; + llvm::raw_string_ostream Comment(CommentBuffer); + + // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> Expr; + Expr.push_back(dwarf::DW_OP_breg0 + /*SP*/ 31); + Expr.push_back(0); + appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_def_cfa. + SmallString<64> DefCfaExpr; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + uint8_t buffer[16]; + DefCfaExpr.append(buffer, + buffer + encodeULEB128(Expr.size(), buffer)); + DefCfaExpr.append(Expr.str()); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), + Comment.str()); +} + +MCCFIInstruction AArch64FrameLowering::createCfaOffset( + const TargetRegisterInfo &TRI, unsigned Reg, + const StackOffset &OffsetFromDefCFA) const { + int64_t NumBytes, NumVGScaledBytes; + OffsetFromDefCFA.getForDwarfOffset(NumBytes, NumVGScaledBytes); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + + // Non-scalable offsets can use DW_CFA_offset directly. 
+ if (!NumVGScaledBytes) + return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; + + // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> OffsetExpr; + appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_expression + SmallString<64> CfaExpr; + CfaExpr.push_back(dwarf::DW_CFA_expression); + uint8_t buffer[16]; + CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); + CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); + CfaExpr.append(OffsetExpr.str()); + + return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); +} + void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetSubtargetInfo &STI = MF.getSubtarget(); - const MCRegisterInfo *MRI = STI.getRegisterInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); const TargetInstrInfo *TII = STI.getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); @@ -415,11 +506,26 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = - MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + + // Not all unwinders may know about SVE registers, so assume the lowest + // common denominator. + unsigned NewReg; + if (static_cast<const AArch64RegisterInfo *>(TRI)->regNeedsCFI(Reg, NewReg)) + Reg = NewReg; + else + continue; + + StackOffset Offset; + if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::SVEVector) { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + Offset = StackOffset(MFI.getObjectOffset(Info.getFrameIdx()), MVT::nxv1i8) - + StackOffset(AFI->getCalleeSavedStackSize(MFI), MVT::i8); + } else { + Offset = {MFI.getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea(), + MVT::i8}; + } + unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1192,7 +1298,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Process the SVE callee-saves to determine what space needs to be // allocated. - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { // Find callee save instructions in frame.
CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1200,11 +1306,7 @@ ++MBBI; CalleeSavesEnd = MBBI; - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - AllocateBefore -= OffsetToCalleeSavesFromSP; + AllocateBefore = {CalleeSavedSize, MVT::nxv1i8}; AllocateAfter = SVEStackSize - AllocateBefore; } @@ -1387,9 +1489,18 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { - // Encode the stack size of the leaf function. - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + unsigned CFIIndex; + if (SVEStackSize) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + StackOffset TotalSize = + SVEStackSize + StackOffset((int64_t)MFI.getStackSize(), MVT::i8); + CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize)); + } else { + // Encode the stack size of the leaf function. + CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + } BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1582,7 +1693,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // deallocated. StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { RestoreBegin = std::prev(RestoreEnd); while (IsSVECalleeSave(RestoreBegin) && RestoreBegin != MBB.begin()) --RestoreBegin; @@ -1592,23 +1703,21 @@ assert(IsSVECalleeSave(RestoreBegin) && IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - DeallocateBefore = OffsetToCalleeSavesFromSP; - DeallocateAfter = SVEStackSize - DeallocateBefore; + StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8}; + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; } // Deallocate the SVE area. if (SVEStackSize) { if (AFI->isStackRealigned()) { - if (AFI->getSVECalleeSavedStackSize()) - // Set SP to start of SVE area, from which the callee-save reloads - // can be done. The code below will deallocate the stack space + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) + // Set SP to start of SVE callee-save area from which they can + // be reloaded. The code below will deallocate the stack space + // by moving FP -> SP. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, - -SVEStackSize, TII, MachineInstr::FrameDestroy); + {-CalleeSavedSize, MVT::nxv1i8}, TII, + MachineInstr::FrameDestroy); } else { if (AFI->getSVECalleeSavedStackSize()) { // Deallocate the non-SVE locals first before we can deallocate (and @@ -2012,6 +2121,7 @@ static void computeCalleeSaveRegisterPairs( // available unwind codes.
This flag ensures that the alignment fixup is done // only once, as intended. bool FixupDone = false; + for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); @@ -2595,25 +2705,23 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex, bool AssignOffsets) { +#ifndef NDEBUG // First process all fixed stack objects. - int64_t Offset = 0; for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - if (MFI.getStackID(I) == TargetStackID::SVEVector) { - int64_t FixedOffset = -MFI.getObjectOffset(I); - if (FixedOffset > Offset) - Offset = FixedOffset; - } + assert(MFI.getStackID(I) != TargetStackID::SVEVector && + "SVE vectors should never be passed on the stack by value, only by " + "reference."); +#endif auto Assign = [&MFI](int FI, int64_t Offset) { LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); MFI.setObjectOffset(FI, Offset); }; + int64_t Offset = 0; + // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Make sure to align the last callee save slot. - MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); - // Assign offsets to the callee save slots. for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); @@ -2623,6 +2731,9 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, } } + // Ensure that the callee-save area is aligned to 16 bytes. + Offset = alignTo(Offset, Align(16U)); + // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 9d0a6d9eaf255..1ca8c3e9e2bf6 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -18,6 +18,8 @@ namespace llvm { +class MCCFIInstruction; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -105,6 +107,12 @@ class AArch64FrameLowering : public TargetFrameLowering { } } + bool isStackIdSafeForLocalArea(unsigned StackId) const override { + // We don't support putting SVE objects into the pre-allocated local + // frame block at the moment.
+ return StackId != TargetStackID::SVEVector; + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; @@ -113,6 +121,11 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; + MCCFIInstruction + createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI, + const StackOffset &OffsetFromSP) const; + MCCFIInstruction createCfaOffset(const TargetRegisterInfo &TRI, unsigned Reg, + const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; }; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10c4778533533..7799ebfbd68e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -245,7 +245,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_ri, unsigned Opc_rr); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. @@ -262,14 +263,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - template - void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, - const unsigned Opc_ri); - template + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); std::tuple<unsigned, SDValue, SDValue> - findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset); + findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri, + const SDValue &OldBase, const SDValue &OldOffset, + unsigned Scale); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -1414,12 +1413,12 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, /// Optimize \param OldBase and \param OldOffset selecting the best addressing /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the /// new Base and an SDValue representing the new offset. -template std::tuple<unsigned, SDValue, SDValue> -AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, + unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset) { + const SDValue &OldOffset, + unsigned Scale) { SDValue NewBase = OldBase; SDValue NewOffset = OldOffset; // Detect a possible Reg+Imm addressing mode. @@ -1429,21 +1428,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, // Detect a possible reg+reg addressing mode, but only if we haven't already // detected a Reg+Imm one. const bool IsRegReg = - !IsRegImm && SelectSVERegRegAddrMode(OldBase, NewBase, NewOffset); + !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset); // Select the instruction. return std::make_tuple(IsRegReg ?
Opc_rr : Opc_ri, NewBase, NewOffset); } void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, - const unsigned Opc) { + unsigned Scale, unsigned Opc_ri, + unsigned Opc_rr) { + assert(Scale < 4 && "Invalid scaling value."); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); + // Optimize addressing mode. + SDValue Base, Offset; + unsigned Opc; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + N, Opc_rr, Opc_ri, N->getOperand(2), + CurDAG->getTargetConstant(0, DL, MVT::i64), Scale); + SDValue Ops[] = {N->getOperand(1), // Predicate - N->getOperand(2), // Memory operand - CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + Base, // Memory operand + Offset, Chain}; const EVT ResTys[] = {MVT::Untyped, MVT::Other}; @@ -1479,10 +1487,9 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } -template void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, - const unsigned Opc_rr, - const unsigned Opc_ri) { + unsigned Scale, unsigned Opc_rr, + unsigned Opc_ri) { SDLoc dl(N); // Form a REG_SEQUENCE to force register allocation. @@ -1492,9 +1499,9 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, // Optimize addressing mode. unsigned Opc; SDValue Offset, Base; - std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), - CurDAG->getTargetConstant(0, dl, MVT::i64)); + CurDAG->getTargetConstant(0, dl, MVT::i64), Scale); SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate Base, // address @@ -4085,63 +4092,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_st2: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 2, AArch64::ST2B, - AArch64::ST2B_IMM); + SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 2, AArch64::ST2H, - AArch64::ST2H_IMM); + SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 2, AArch64::ST2W, - AArch64::ST2W_IMM); + SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 2, AArch64::ST2D, - AArch64::ST2D_IMM); + SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM); return; } break; } case Intrinsic::aarch64_sve_st3: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 3, AArch64::ST3B, - AArch64::ST3B_IMM); + SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 3, AArch64::ST3H, - AArch64::ST3H_IMM); + SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 3, AArch64::ST3W, - AArch64::ST3W_IMM); + SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 3, AArch64::ST3D, - AArch64::ST3D_IMM); + SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM); return; } break; } case Intrinsic::aarch64_sve_st4: { if (VT == 
MVT::nxv16i8) { - SelectPredicatedStore(Node, 4, AArch64::ST4B, - AArch64::ST4B_IMM); + SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 4, AArch64::ST4H, - AArch64::ST4H_IMM); + SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 4, AArch64::ST4W, - AArch64::ST4W_IMM); + SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 4, AArch64::ST4D, - AArch64::ST4D_IMM); + SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM); return; } break; @@ -4741,51 +4736,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case AArch64ISD::SVE_LD2_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D); return; } break; } case AArch64ISD::SVE_LD3_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); + SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D); return; } break; } case AArch64ISD::SVE_LD4_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D); return; } break; @@ -4805,10 +4800,14 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, /// When \p PredVT is a scalable vector predicate in the form /// MVT::nx<M>xi1, it builds the corresponding scalable vector of
-/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input +/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting +/// structured vectors (NumVec > 1), the output data type is +/// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input /// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid /// EVT. -static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, + unsigned NumVec) { + assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors."); if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) return EVT(); @@ -4818,7 +4817,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { ElementCount EC = PredVT.getVectorElementCount(); EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); - EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); + return MemVT; } @@ -4842,6 +4842,15 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { return cast<VTSDNode>(Root->getOperand(3))->getVT(); case AArch64ISD::ST1_PRED: return cast<VTSDNode>(Root->getOperand(4))->getVT(); + case AArch64ISD::SVE_LD2_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2); + case AArch64ISD::SVE_LD3_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3); + case AArch64ISD::SVE_LD4_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4); default: break; } @@ -4857,7 +4866,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { // We are using an SVE prefetch intrinsic. Type must be inferred // from the width of the predicate.
return getPackedVectorTypeFromPredicateType( - Ctx, Root->getOperand(2)->getValueType(0)); + Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1); } /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index af77171e92849..402d7656ca215 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -230,7 +230,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::nxv2f64 }) { setCondCodeAction(ISD::SETO, VT, Expand); setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); setCondCodeAction(ISD::SETULT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); setCondCodeAction(ISD::SETUGE, VT, Expand); @@ -1395,14 +1397,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::SDIV_PRED) + MAKE_CASE(AArch64ISD::SHL_PRED) + MAKE_CASE(AArch64ISD::SMAX_PRED) + MAKE_CASE(AArch64ISD::SMIN_PRED) + MAKE_CASE(AArch64ISD::SRA_PRED) + MAKE_CASE(AArch64ISD::SRL_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) - MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1) - MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1) - MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1) - MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1) - MAKE_CASE(AArch64ISD::SHL_MERGE_OP1) - MAKE_CASE(AArch64ISD::SRL_MERGE_OP1) - MAKE_CASE(AArch64ISD::SRA_MERGE_OP1) + MAKE_CASE(AArch64ISD::UMAX_PRED) + MAKE_CASE(AArch64ISD::UMIN_PRED) MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) MAKE_CASE(AArch64ISD::ADC) MAKE_CASE(AArch64ISD::SBC) @@ -3538,13 +3540,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::UDIV: return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED); case ISD::SMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED); case ISD::UMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); case ISD::SMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED); case ISD::UMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -8912,7 +8914,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SHL: if (VT.isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), @@ -8924,8 +8926,8 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: if (VT.isScalableVector()) { - unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1 - : AArch64ISD::SRL_MERGE_OP1; + unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_PRED + : AArch64ISD::SRL_PRED; return LowerToPredicatedOp(Op, DAG, Opc); } @@ -11938,6 +11940,25 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, Zero); } +// If a merged operation has no inactive lanes we can relax it to a predicated +// or unpredicated operation, which potentially allows better isel (perhaps +// using immediate forms) or relaxing register reuse requirements. +static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc, + SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); + assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); + SDValue Pg = N->getOperand(1); + + // ISD way to specify an all active predicate. + if ((Pg.getOpcode() == AArch64ISD::PTRUE) && + (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all)) + return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg, + N->getOperand(2), N->getOperand(3)); + + // FUTURE: SplatVector(true) + return SDValue(); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -12016,26 +12037,19 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); case Intrinsic::aarch64_sve_smin: - return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); case Intrinsic::aarch64_sve_umin: - return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG); case Intrinsic::aarch64_sve_smax: - return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG); case Intrinsic::aarch64_sve_umax: - return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG); case Intrinsic::aarch64_sve_lsl: - return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG); case Intrinsic::aarch64_sve_lsr: - return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); case Intrinsic::aarch64_sve_asr: - return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2), N->getOperand(3)); + return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), @@ -12347,6 +12361,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); + if (VT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) @@ -14970,6 +14987,11 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (isa(Inst.getOperand(i)->getType())) return true; + if (const AllocaInst 
*AI = dyn_cast(&Inst)) { + if (isa(AI->getAllocatedType())) + return true; + } + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 344ed96d79609..c346debb823aa 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -72,7 +72,7 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions - // Arithmetic instructions + // Predicated instructions where inactive lanes produce undefined results. ADD_PRED, FADD_PRED, FDIV_PRED, @@ -80,14 +80,14 @@ enum NodeType : unsigned { FMUL_PRED, FSUB_PRED, SDIV_PRED, + SHL_PRED, + SMAX_PRED, + SMIN_PRED, + SRA_PRED, + SRL_PRED, UDIV_PRED, - SMIN_MERGE_OP1, - UMIN_MERGE_OP1, - SMAX_MERGE_OP1, - UMAX_MERGE_OP1, - SHL_MERGE_OP1, - SRL_MERGE_OP1, - SRA_MERGE_OP1, + UMAX_PRED, + UMIN_PRED, SETCC_MERGE_ZERO, @@ -462,12 +462,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - /// This method returns a target specific FastISel object, or null if the /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 6df7970f4d82b..088c129bc5f3c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); }]>; +def SImmS32XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64); +}]>; // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. @@ -546,7 +549,7 @@ def simm4s16 : Operand, ImmLeaf, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { +[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> { let PrintMethod = "printImmScale<32>"; let ParserMatchClass = SImm4s32Operand; let DecoderMethod = "DecodeSImm<4>"; @@ -1934,11 +1937,21 @@ class SignAuthTwoOperand opc, string asm, let Inst{4-0} = Rd; } +class ClearAuth data, string asm> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn), asm, "\t$Rd", "$Rd = $Rn", []>, Sched<[]> { + bits<5> Rd; + let Inst{31-11} = 0b110110101100000101000; + let Inst{10} = data; + let Inst{9-5} = 0b11111; + let Inst{4-0} = Rd; +} + // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation : I<(outs), iops, asm, ops, "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; + let Defs = [NZCV]; bits<5> Rn; let Inst{31} = sf; let Inst{30-15} = 0b0111010000000000; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 08f80c9aa361b..b6fda6b367bf2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3861,7 +3861,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } -// FP Opcodes that can be combined with a FMUL +// FP Opcodes that can be combined with a FMUL. 
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { default: break; @@ -3883,8 +3883,12 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; - return (Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast); + // We can fuse FADD/FSUB with FMUL if fusion is either allowed globally by + // the target options or if FADD/FSUB has the contract fast-math flag. + return Options.UnsafeFPMath || + Options.AllowFPOpFusion == FPOpFusion::Fast || + Inst.getFlag(MachineInstr::FmContract); } return false; } @@ -6847,10 +6851,9 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) return None; - Offset = MI.getOperand(2).getImm() * Sign; int Shift = MI.getOperand(3).getImm(); assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); - Offset = Offset << Shift; + Offset = Sign * (MI.getOperand(2).getImm() << Shift); } } return RegImmPair{MI.getOperand(1).getReg(), Offset}; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d4e984754d0e9..39e1ee3ad8c18 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -994,8 +994,8 @@ let Predicates = [HasPA] in { defm PAC : SignAuth<0b000, 0b010, "pac">; defm AUT : SignAuth<0b001, 0b011, "aut">; - def XPACI : SignAuthZero<0b100, 0b00, "xpaci">; - def XPACD : SignAuthZero<0b100, 0b01, "xpacd">; + def XPACI : ClearAuth<0, "xpaci">; + def XPACD : ClearAuth<1, "xpacd">; def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; // Combined Instructions @@ -1030,7 +1030,7 @@ let Predicates = [HasPA] in { } // v8.3a floating point conversion for javascript -let Predicates = [HasJS, HasFPARMv8] in +let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, "fjcvtzs", [(set GPR32:$Rd, @@ -1039,7 +1039,7 @@ def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, } // HasJS, HasFPARMv8 // v8.4 Flag manipulation instructions -let Predicates = [HasFMI] in { +let Predicates = [HasFMI], Defs = [NZCV], Uses = [NZCV] in { def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> { let Inst{20-5} = 0b0000001000000000; } @@ -4483,6 +4483,10 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), (FCVTPSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzs (v1f64 FPR64:$Rn))), + (FCVTZSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzu (v1f64 FPR64:$Rn))), + (FCVTZUv1i64 FPR64:$Rn)>; def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))), (FRECPEv1f16 FPR16:$Rn)>; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 83a488afc7972..62cc865fd1c36 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -40,6 +40,29 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) AArch64_MC::initLLVMToCVRegMapping(this); } +/// Return whether the register needs a CFI entry. Not all unwinders may know +/// about SVE registers, so we assume the lowest common denominator, i.e. the +/// callee-saves required by the base ABI. For the SVE registers z8-z15 only the +/// lower 64-bits (d8-d15) need to be saved. The lower 64-bits subreg is +/// returned in \p RegToUseForCFI. +bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg, + unsigned &RegToUseForCFI) const { + if (AArch64::PPRRegClass.contains(Reg)) + return false; + + if (AArch64::ZPRRegClass.contains(Reg)) { + RegToUseForCFI = getSubReg(Reg, AArch64::dsub); + for (int I = 0; CSR_AArch64_AAPCS_SaveList[I]; ++I) { + if (CSR_AArch64_AAPCS_SaveList[I] == RegToUseForCFI) + return true; + } + return false; + } + + RegToUseForCFI = Reg; + return true; +} + static bool hasSVEArgsOrReturn(const MachineFunction *MF) { const Function &F = MF->getFunction(); return isa<ScalableVectorType>(F.getReturnType()) || diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 22a8ba76c6111..91064787d3dac 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -122,6 +122,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { MachineFunction &MF) const override; unsigned getLocalAddressRegister(const MachineFunction &MF) const; + bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index bd05c56009a1d..54b351fda053b 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -133,6 +133,9 @@ def NZCV : AArch64Reg<0, "nzcv">; // First fault status register def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>; +// Purely virtual Vector Granule (VG) DWARF register +def VG : AArch64Reg<0, "vg">, DwarfRegNum<[46]>; + // GPR register classes with the intersections of GPR32/GPR32sp and // GPR64/GPR64sp for use by the coalescer. def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index dc501a9536b9a..cf34e5f6470da 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -174,22 +174,20 @@ def SDT_AArch64FMA : SDTypeProfile<1, 4, [ // Predicated operations with the result of inactive lanes being unspecified. def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; +def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>; def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>; +def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>; +def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; +def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; +def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; - -// Merging op1 into the inactive lanes.
-def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>; -def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>; -def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>; -def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>; -def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>; -def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>; -def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>; +def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; +def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; @@ -210,6 +208,19 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; +def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs), + [(setoge node:$lhs, node:$rhs), + (setge node:$lhs, node:$rhs)]>; +def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs), + [(setogt node:$lhs, node:$rhs), + (setgt node:$lhs, node:$rhs)]>; +def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), + [(setoeq node:$lhs, node:$rhs), + (seteq node:$lhs, node:$rhs)]>; +def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), + [(setone node:$lhs, node:$rhs), + (setne node:$lhs, node:$rhs)]>; + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; @@ -274,10 +285,10 @@ let Predicates = [HasSVE] in { defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>; defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>; - defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>; - defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>; - defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>; - defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>; + defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>; + defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>; + defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_p>; + defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_p>; defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; @@ -330,12 +341,17 @@ let Predicates = [HasSVE] in { defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>; - defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>; - defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>; - defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>; - defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>; - defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>; - defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>; + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>; + defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>; + defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, 
DestructiveBinaryComm>; + defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", "UMIN_ZPZZ", int_aarch64_sve_umin, DestructiveBinaryComm>; + defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, DestructiveBinaryComm>; + defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>; + + defm SMAX_ZPZZ : sve_int_bin_pred_bhsd; + defm UMAX_ZPZZ : sve_int_bin_pred_bhsd; + defm SMIN_ZPZZ : sve_int_bin_pred_bhsd; + defm UMIN_ZPZZ : sve_int_bin_pred_bhsd; defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>; defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>; @@ -1172,10 +1188,10 @@ multiclass sve_prefetch; defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>; defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -1300,9 +1316,9 @@ multiclass sve_prefetch; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>; - defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>; defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; @@ -1315,19 +1331,23 @@ multiclass sve_prefetch; let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { - defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; - defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; - defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd; defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; } - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">; + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; defm ASRR_ZPmZ : 
sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>; defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>; defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>; + defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>; + defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>; + defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>; + defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index 6fa1c744f77e2..24751a81797d3 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -123,6 +123,18 @@ class StackOffset { } } + void getForDwarfOffset(int64_t &ByteSized, int64_t &VGSized) const { + assert(isValid() && "Invalid frame offset"); + + // VGSized offsets are divided by '2', because the VG register is the + // number of 64-bit granules as opposed to 128-bit vector chunks, + // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. + // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. + // VG = n * 2 and the DWARF offset must be VG * 8 bytes. + ByteSized = Bytes; + VGSized = ScalableBytes / 2; + } + /// Returns whether the offset is known zero. explicit operator bool() const { return Bytes || ScalableBytes; } diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 61f27cbc3b29d..1b8c6417be38d 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -59,7 +59,7 @@ using namespace llvm; -#define DEBUG_TYPE "stack-tagging" +#define DEBUG_TYPE "aarch64-stack-tagging" static cl::opt<bool> ClMergeInit( "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, @@ -544,7 +544,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order SmallVector<Instruction *, 8> RetVec; - DenseMap<Value *, AllocaInst *> AllocaForValue; SmallVector<Instruction *, 4> UnrecognizedLifetimes; for (auto &BB : *F) { @@ -566,8 +565,7 @@ auto *II = dyn_cast<IntrinsicInst>(I); if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end)) { - AllocaInst *AI = - llvm::findAllocaForValue(II->getArgOperand(1), AllocaForValue); + AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); if (!AI) { UnrecognizedLifetimes.push_back(I); continue; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index b0cef9b66e017..819f7b614106e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -148,10 +148,10 @@ static cl::opt<int> EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); -static cl::opt<bool> EnableSVEIntrinsicOpts( - "aarch64-sve-intrinsic-opts", cl::Hidden, - cl::desc("Enable SVE intrinsic opts"), - cl::init(true)); +static cl::opt<bool> + EnableSVEIntrinsicOpts("aarch64-enable-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden); @@ -457,6 +457,7 @@ void AArch64PassConfig::addIRPasses() { .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) + .hoistCommonInsts(true) .sinkCommonInsts(true))); // Run LoopDataPrefetch diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 7738a42293919..25e6261343179 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -57,6 +57,12 @@ class AArch64TargetMachine : public LLVMTargetMachine { SMDiagnostic &Error, SMRange &SourceRange) const override; + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } + private: bool isLittle; }; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index cf6de797727be..5f5da63b21b64 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -270,6 +270,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, } int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -306,7 +307,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); static const TypeConversionCostTblEntry ConversionTbl[] = { @@ -410,7 +412,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost); - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -442,12 +445,14 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, // we may get the extension for free. If not, get the default cost for the // extend. if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) - return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); // The destination type should be larger than the element type. If not, get // the default cost for the extend. if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) - return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); switch (Opcode) { default: @@ -466,7 +471,8 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, } // If we are unable to perform the extend for free, get the default cost. 
- return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind); + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); } unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 1f029689a60e6..05b7f70f2335c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -114,7 +114,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, @@ -166,6 +166,9 @@ class AArch64TTIImpl : public BasicTTIImplBase { return false; Type *Ty = cast(DataType)->getElementType(); + if (Ty->isPointerTy()) + return true; + if (Ty->isBFloatTy() || Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 0ac09c4f96f04..beae2b059bec5 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5089,12 +5089,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, SmallVector &RequestedExtensions) { - const bool NoCrypto = - (std::find(RequestedExtensions.begin(), RequestedExtensions.end(), - "nocrypto") != std::end(RequestedExtensions)); - const bool Crypto = - (std::find(RequestedExtensions.begin(), RequestedExtensions.end(), - "crypto") != std::end(RequestedExtensions)); + const bool NoCrypto = llvm::is_contained(RequestedExtensions, "nocrypto"); + const bool Crypto = llvm::is_contained(RequestedExtensions, "crypto"); if (!NoCrypto && Crypto) { switch (ArchKind) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index eb6a4aa3d8266..9f7950851f65c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -133,6 +133,8 @@ class AArch64InstructionSelector : public InstructionSelector { bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool tryOptShuffleDupLane(MachineInstr &I, LLT DstTy, LLT SrcTy, + ArrayRef Mask, MachineRegisterInfo &MRI) const; bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -2304,12 +2306,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { I.addOperand(MachineOperand::CreateImm(Offset)); // If we're storing a 0, use WZR/XZR. 
- if (auto CVal = getConstantVRegVal(ValReg, MRI)) { - if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { - if (I.getOpcode() == AArch64::STRWui) + if (Opcode == TargetOpcode::G_STORE) { + auto CVal = getConstantVRegValWithLookThrough( + ValReg, MRI, /*LookThroughInstrs = */ true, + /*HandleFConstants = */ false); + if (CVal && CVal->Value == 0) { + unsigned Opc = I.getOpcode(); + switch (Opc) { + case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: I.getOperand(0).setReg(AArch64::WZR); - else if (I.getOpcode() == AArch64::STRXui) + break; + case AArch64::STRXui: I.getOperand(0).setReg(AArch64::XZR); + break; + } } } @@ -4296,6 +4308,67 @@ MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare( return &*CmpMI; } +bool AArch64InstructionSelector::tryOptShuffleDupLane( + MachineInstr &I, LLT DstTy, LLT SrcTy, ArrayRef Mask, + MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + + // We assume that scalar->vector splats have been handled in the + // post-legalizer combiner to G_DUP. However splats of a source vector's + // lane don't fit that pattern; detect it here: + // %res = G_SHUFFLE_VECTOR %src:, undef, splat(lane-idx) + // => + // %res = DUPv[N][Ty]lane %src, lane-idx + // FIXME: this case should be covered by re-implementing the perfect shuffle + // codegen mechanism. + + auto LaneIdx = getSplatIndex(I); + if (!LaneIdx) + return false; + + // The lane idx should be within the first source vector. + if (*LaneIdx >= SrcTy.getNumElements()) + return false; + + if (DstTy != SrcTy) + return false; + + LLT ScalarTy = SrcTy.getElementType(); + unsigned ScalarSize = ScalarTy.getSizeInBits(); + + unsigned Opc = 0; + switch (SrcTy.getNumElements()) { + case 2: + if (ScalarSize == 64) + Opc = AArch64::DUPv2i64lane; + break; + case 4: + if (ScalarSize == 32) + Opc = AArch64::DUPv4i32lane; + break; + case 8: + if (ScalarSize == 16) + Opc = AArch64::DUPv8i16lane; + break; + case 16: + if (ScalarSize == 8) + Opc = AArch64::DUPv16i8lane; + break; + default: + break; + } + if (!Opc) + return false; + + MachineIRBuilder MIB(I); + auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, + {I.getOperand(1).getReg()}) + .addImm(*LaneIdx); + constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectShuffleVector( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); @@ -4317,6 +4390,9 @@ bool AArch64InstructionSelector::selectShuffleVector( return false; } + if (tryOptShuffleDupLane(I, DstTy, Src1Ty, Mask, MRI)) + return true; + unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector CstIdxs; @@ -5073,11 +5149,59 @@ InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, unsigned SizeInBytes) const { MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - // If we have a constant offset, then we probably don't want to match a - // register offset. - if (isBaseWithConstantOffset(Root, MRI)) + if (!Root.isReg()) return None; + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd) + return None; + + // Check for immediates which cannot be encoded in the [base + imm] + // addressing mode, and can't be encoded in an add/sub.
If this happens, we'll + // end up with code like: + // + // mov x0, wide + // add x1 base, x0 + // ldr x2, [x1, x0] + // + // In this situation, we can use the [base, xreg] addressing mode to save an + // add/sub: + // + // mov x0, wide + // ldr x2, [base, x0] auto ValAndVReg = + getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); + if (ValAndVReg) { + unsigned Scale = Log2_32(SizeInBytes); + int64_t ImmOff = ValAndVReg->Value; + + // Skip immediates that can be selected in the load/store addressing + // mode. + if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && + ImmOff < (0x1000 << Scale)) + return None; + + // Helper lambda to decide whether or not it is preferable to emit an add. + auto isPreferredADD = [](int64_t ImmOff) { + // Constants in [0x0, 0xfff] can be encoded in an add. + if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) + return true; + + // Can it be encoded in an add lsl #12? + if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) + return false; + + // It can be encoded in an add lsl #12, but we may not want to. If it is + // possible to select this as a single movz, then prefer that. A single + // movz is faster than an add with a shift. + return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && + (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; + }; + + // If the immediate can be encoded in a single add/sub, then bail out. + if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) + return None; + } // Try to fold shifts into the addressing mode. auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 2eaec0b970fa6..063c451440dc1 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -62,7 +62,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) } getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64, v16s8, v8s16}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( @@ -156,7 +156,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, - G_FNEARBYINT}) + G_FNEARBYINT, G_INTRINSIC_LRINT}) // If we don't have full FP16 support, then scalarize the elements of // vectors containing fp16 types. .fewerElementsIf( @@ -351,7 +351,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) if (DstSize == 128 && !Query.Types[0].isVector()) return false; // Extending to a scalar s128 needs narrowing. - + // Make sure that we have something that will fit in a register, and // make sure it's a power of 2. if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) @@ -399,7 +399,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) .clampScalar(1, s32, s64) - .widenScalarToNextPow2(1) + .minScalarSameAs(1, 0) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); @@ -596,7 +596,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // to be the same size as the dest.
if (DstTy != SrcTy) return false; - for (auto &Ty : {v2s32, v4s32, v2s64}) { + for (auto &Ty : {v2s32, v4s32, v2s64, v16s8, v8s16}) { if (DstTy == Ty) return true; } @@ -676,6 +676,27 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI, // Set the regclass on the dest reg too. MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so + // by creating a MOVK that sets bits 48-63 of the register to (global address + // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to + // prevent an incorrect tag being generated during relocation when the + // global appears before the code section. Without the offset, a global at + // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced + // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 = + // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe` + // instead of `0xf`. + // This assumes that we're in the small code model so we can assume a binary + // size of <= 4GB, which makes the untagged PC relative offset positive. The + // binary must also be loaded into address range [0, 2^48). Both of these + // properties need to be ensured at runtime when using tagged addresses. + if (OpFlags & AArch64II::MO_TAGGED) { + ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP}) + .addGlobalAddress(GV, 0x100000000, + AArch64II::MO_PREL | AArch64II::MO_G3) + .addImm(48); + MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + } + MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index baa8515baf3ea..15fa5cae2c424 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -43,29 +44,6 @@ struct ShuffleVectorPseudo { ShuffleVectorPseudo() {} }; -/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat. -/// If \p MI is not a splat, returns None. -static Optional getSplatIndex(MachineInstr &MI) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && - "Only G_SHUFFLE_VECTOR can have a splat index!"); - ArrayRef Mask = MI.getOperand(3).getShuffleMask(); - auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; }); - - // If all elements are undefined, this shuffle can be considered a splat. - // Return 0 for better potential for callers to simplify. - if (FirstDefinedIdx == Mask.end()) - return 0; - - // Make sure all remaining elements are either undef or the same - // as the first non-undef value. - int SplatValue = *FirstDefinedIdx; - if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()), - [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; })) - return None; - - return SplatValue; -} - /// Check if a vector shuffle corresponds to a REV instruction with the /// specified blocksize.
static bool isREVMask(ArrayRef M, unsigned EltSize, unsigned NumElts, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 9f7dfdf624829..dc44980ce218c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -88,8 +88,6 @@ class AArch64AsmBackend : public MCAsmBackend { uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const override; - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; @@ -463,11 +461,6 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, } } -bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const { - return false; -} - bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index b0f414bd27edd..012661edbbfda 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -373,7 +373,11 @@ void AArch64MachObjectWriter::recordRelocation( Type == MachO::ARM64_RELOC_PAGE21 || Type == MachO::ARM64_RELOC_PAGEOFF12) && Value) { - assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + if (!isInt<24>(Value)) { + Asm.getContext().reportError(Fixup.getLoc(), + "addend too big for relocation"); + return; + } MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index ee36ac0168003..c5dd327d2aef4 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2382,11 +2382,19 @@ multiclass sve_int_bin_pred_arit_0 opc, string asm, string Ps, def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_1 opc, string asm, SDPatternOperator op> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_1 opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>, + SVEPseudo2Instr; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; @@ -7728,9 +7736,13 @@ multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; // Base addressing mode - def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), - (!cast(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; - + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)), + (!cast(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + let AddedComplexity 
= 2 in { + // Reg + Imm addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))), + (!cast(NAME) $Pg, $base, simm4s32:$imm)>; + } } class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 74fe0cdd1ea7f..542d2c9645150 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -37,7 +37,7 @@ using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "sve-intrinsic-opts" +#define DEBUG_TYPE "aarch64-sve-intrinsic-opts" namespace llvm { void initializeSVEIntrinsicOptsPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 88c79665be60d..251e12ee09f25 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -281,8 +281,6 @@ enum TargetIndex { }; } -} // End namespace llvm - /// OpenCL uses address spaces to differentiate between /// various memory regions on the hardware. On the CPU /// all of the address spaces point to the same memory, @@ -339,4 +337,17 @@ namespace AMDGPUAS { }; } +namespace AMDGPU { + +// FIXME: Missing constant_32bit +inline bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS || + AS > AMDGPUAS::MAX_AMDGPU_ADDRESS; +} +} + +} // End namespace llvm + #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index bb2aba0449748..90e7a39b30d79 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -96,7 +96,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; - const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); + const Value *Base = getUnderlyingObject(Loc.Ptr); AS = Base->getType()->getPointerAddressSpace(); if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index b09e92c07f9ba..45f515c5115ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -131,10 +131,20 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { // We're tracking up to the Function boundaries, and cannot go beyond because // of FunctionPass restrictions. We can ensure that memory is not clobbered // for memory operations that are live in to entry points only.
- bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I); Instruction *PtrI = dyn_cast(Ptr); - if (!PtrI && NotClobbered && isGlobalLoad(I)) { - if (isa(Ptr) || isa(Ptr)) { + + if (!isEntryFunc) { + if (PtrI) + setUniformMetadata(PtrI); + return; + } + + bool NotClobbered = false; + if (PtrI) + NotClobbered = !isClobberedInFunction(&I); + else if (isa(Ptr) || isa(Ptr)) { + if (isGlobalLoad(I) && !isClobberedInFunction(&I)) { + NotClobbered = true; // Lookup for the existing GEP if (noClobberClones.count(Ptr)) { PtrI = noClobberClones[Ptr]; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index a72d100b13075..9bf3653006006 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -993,7 +993,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, Register InputReg = MRI.createGenericVirtualRegister(ArgTy); if (IncomingArg) { - LI->loadInputValue(InputReg, MIRBuilder, IncomingArg); + LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); } else { assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); @@ -1026,13 +1026,16 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (!OutgoingArg) return false; - const ArgDescriptor *IncomingArgX = std::get<0>( - CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); - const ArgDescriptor *IncomingArgY = std::get<0>( - CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); - const ArgDescriptor *IncomingArgZ = std::get<0>( - CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); + auto WorkitemIDX = + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); + auto WorkitemIDY = + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + auto WorkitemIDZ = + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX); + const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY); + const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ); const LLT S32 = LLT::scalar(32); // If incoming ids are not packed we need to pack them. @@ -1040,12 +1043,14 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, Register InputReg; if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) { InputReg = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX); + LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, + std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); } if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) { Register Y = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(Y, MIRBuilder, IncomingArgY); + LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY), + std::get<2>(WorkitemIDY)); Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0); InputReg = InputReg ? 
MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y; @@ -1053,7 +1058,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) { Register Z = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(Z, MIRBuilder, IncomingArgZ); + LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ), + std::get<2>(WorkitemIDZ)); Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0); InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z; @@ -1067,7 +1073,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, ArgDescriptor IncomingArg = ArgDescriptor::createArg( IncomingArgX ? *IncomingArgX : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u); - LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg); + LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, + &AMDGPU::VGPR_32RegClass, S32); } if (OutgoingArg->isRegister()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index d11b9801b9716..4d78a4f063209 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -22,7 +22,7 @@ namespace llvm { class AMDGPUTargetLowering; class MachineInstrBuilder; -class AMDGPUCallLowering: public CallLowering { +class AMDGPUCallLowering final : public CallLowering { void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy, uint64_t Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 3f12addbcc79b..056f91db24ff6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,6 +51,11 @@ def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods? +def gi_vop3opsel : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_smrd_imm : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 5539f4e8699db..368af73b18d3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -716,8 +716,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX || - Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) { + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f7cdc7461eacd..1f5d83d379495 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3795,8 +3795,15 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); if (Res.getOpcode() != AMDGPUISD::FMED3) return SDValue(); // Op got folded away. 
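For context on the AMDGPUCallLowering hunk above: the three workitem IDs are packed into one 32-bit register using 10-bit fields, with X in the low bits, Y shifted left by 10, and Z shifted left by 20, which is exactly the buildShl/buildOr chain shown. A minimal standalone sketch of that layout follows; the helper name and the masks are illustrative only (the in-tree code relies on the incoming values already being in range):

#include <cassert>
#include <cstdint>

// Mirrors the shl/or sequence built with MIRBuilder above: X | Y<<10 | Z<<20.
// The masks just make the assumed 10-bit field layout explicit.
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}

int main() {
  assert(packWorkItemIDs(1, 2, 3) == (1u | (2u << 10) | (3u << 20)));
  return 0;
}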
- if (!N0.hasOneUse()) - DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + + if (!N0.hasOneUse()) { + SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res); + DAG.ReplaceAllUsesWith(N0, Neg); + + for (SDNode *U : Neg->uses()) + DCI.AddToWorklist(U); + } + return Res; } case ISD::FP_EXTEND: @@ -4336,7 +4343,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) - NODE_NAME_CASE(ATOMIC_LOAD_CSUB) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 85f23c81db170..e995fdcd29ce4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -508,7 +508,6 @@ enum NodeType : unsigned { ATOMIC_DEC, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, - ATOMIC_LOAD_CSUB, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp index 3b5d91133a2f8..04c41eb368327 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -134,7 +134,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const { Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) continue; - PtrArg = GetUnderlyingObject(PtrArg, DL); + PtrArg = getUnderlyingObject(PtrArg); if (const AllocaInst *AI = dyn_cast(PtrArg)) { if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 58f251fec5e98..e186f66a83536 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -723,12 +723,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { /// /// Note: This only supports non-TFE/LWE image intrinsic calls; those have /// struct returns. -Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, - IntrinsicInst &II, - APInt DemandedElts, - int DMaskIdx = -1) { +static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedElts, + int DMaskIdx = -1) { - auto *IIVTy = cast(II.getType()); + auto *IIVTy = cast(II.getType()); unsigned VWidth = IIVTy->getNumElements(); if (VWidth == 1) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a637442002932..16fc759f0cbf6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -170,19 +170,6 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) return false; - // Don't constrain the source register to a class so the def instruction - // handles it (unless it's undef). - // - // FIXME: This is a hack. When selecting the def, we neeed to know - // specifically know that the result is VCCRegBank, and not just an SGPR - // with size 1. An SReg_32 with size 1 is ambiguous with wave32. 
- if (Src.isUndef()) { - const TargetRegisterClass *SrcRC = - TRI.getConstrainedRegClassForOperand(Src, *MRI); - if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) - return false; - } - return true; } @@ -286,50 +273,24 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { } bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { - MachineOperand &Dst = I.getOperand(0); - MachineOperand &Src0 = I.getOperand(1); - MachineOperand &Src1 = I.getOperand(2); - Register DstReg = Dst.getReg(); + Register DstReg = I.getOperand(0).getReg(); unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); - if (DstRB->getID() == AMDGPU::VCCRegBankID) { - const TargetRegisterClass *RC = TRI.getBoolRC(); - unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), - RC == &AMDGPU::SReg_64RegClass); - I.setDesc(TII.get(InstOpc)); - // Dead implicit-def of scc - I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef - true, // isImp - false, // isKill - true)); // isDead - - // FIXME: Hack to avoid turning the register bank into a register class. - // The selector for G_ICMP relies on seeing the register bank for the result - // is VCC. In wave32 if we constrain the registers to SReg_32 here, it will - // be ambiguous whether it's a scalar or vector bool. - if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg())) - MRI->setRegClass(Src0.getReg(), RC); - if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg())) - MRI->setRegClass(Src1.getReg(), RC); - - return RBI.constrainGenericRegister(DstReg, *RC, *MRI); - } - - // TODO: Should this allow an SCC bank result, and produce a copy from SCC for - // the result? - if (DstRB->getID() == AMDGPU::SGPRRegBankID) { - unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32); - I.setDesc(TII.get(InstOpc)); - // Dead implicit-def of scc - I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef - true, // isImp - false, // isKill - true)); // isDead - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } + if (DstRB->getID() != AMDGPU::SGPRRegBankID && + DstRB->getID() != AMDGPU::VCCRegBankID) + return false; - return false; + bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID && + STI.isWave64()); + I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64))); + + // Dead implicit-def of scc + I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef + true, // isImp + false, // isKill + true)); // isDead + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { @@ -616,11 +577,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { return true; } -static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { - int64_t Val; - return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; -} - bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr &MI) const { if (selectImpl(MI, *CoverageInfo)) @@ -645,6 +601,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock *BB = MI.getParent(); + auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true); + if (ConstSrc1) { + auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true); + if (ConstSrc0) { + uint32_t Lo16 = static_cast(ConstSrc0->Value) & 0xffff; + uint32_t Hi16 = static_cast(ConstSrc1->Value) & 0xffff; + + BuildMI(*BB, &MI, DL, 
TII.get(AMDGPU::S_MOV_B32), Dst) .addImm(Lo16 | (Hi16 << 16)); MI.eraseFromParent(); return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + // TODO: This should probably be a combine somewhere // (build_vector_trunc $src0, undef -> copy $src0 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); @@ -686,7 +656,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && isZero(Src1, *MRI)) { + } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) @@ -736,6 +706,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { if (Offset % 32 != 0 || InsSize % 32 != 0) return false; + // Currently not handled by getSubRegFromChannel. + if (InsSize > 128) + return false; + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); if (SubReg == AMDGPU::NoSubRegister) return false; @@ -891,6 +865,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return selectIntrinsicIcmp(I); case Intrinsic::amdgcn_ballot: return selectBallot(I); + case Intrinsic::amdgcn_reloc_constant: + return selectRelocConstant(I); default: return selectImpl(I, *CoverageInfo); } @@ -1071,6 +1047,31 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { + Register DstReg = I.getOperand(0).getReg(); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI); + if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) + return false; + + const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID; + + Module *M = MF->getFunction().getParent(); + const MDNode *Metadata = I.getOperand(2).getMetadata(); + auto SymbolName = cast(Metadata->getOperand(0))->getString(); + auto RelocSymbol = cast( + M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); + + MachineBasicBlock *BB = I.getParent(); + BuildMI(*BB, &I, I.getDebugLoc(), + TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg) + .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO); + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. @@ -1300,12 +1301,15 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(PtrBase); - BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) + if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) .addImm(Offset) .addImm(IsGDS ?
-1 : 0) .cloneMemRefs(MI); MI.eraseFromParent(); - return true; + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, @@ -1851,12 +1855,33 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { if (!DstTy.isScalar()) return false; - if (I.getOpcode() == AMDGPU::G_ANYEXT) - return selectCOPY(I); - // Artifact casts should never use vcc. const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); + // FIXME: This should probably be illegal and split earlier. + if (I.getOpcode() == AMDGPU::G_ANYEXT) { + if (DstSize <= 32) + return selectCOPY(I); + + const TargetRegisterClass *SrcRC = + TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + + Register UndefReg = MRI->createVirtualRegister(SrcRC); + BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); + BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(SrcReg) + .addImm(AMDGPU::sub0) + .addReg(UndefReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + + return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && + RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); + } + if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { // 64-bit should have been split up in RegBankSelect @@ -1948,7 +1973,7 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); } else if (ImmOp.isCImm()) { - ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); + ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); } Register DstReg = I.getOperand(0).getReg(); @@ -2274,8 +2299,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { CondPhysReg = AMDGPU::SCC; BrOpcode = AMDGPU::S_CBRANCH_SCC1; - // FIXME: Hack for isSCC tests - ConstrainRC = &AMDGPU::SGPR_32RegClass; + ConstrainRC = &AMDGPU::SReg_32RegClass; } else { // FIXME: Do we have to insert an and with exec here, like in SelectionDAG? 
// We sort of know that a VCC producer based on the register bank, that ands @@ -2499,7 +2523,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(IdxReg); BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit); MI.eraseFromParent(); return true; @@ -2509,7 +2533,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( .addReg(IdxReg) .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); @@ -2830,6 +2854,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UMIN: case TargetOpcode::G_ATOMICRMW_UMAX: case TargetOpcode::G_ATOMICRMW_FADD: + case AMDGPU::G_AMDGPU_ATOMIC_INC: + case AMDGPU::G_AMDGPU_ATOMIC_DEC: return selectG_LOAD_ATOMICRMW(I); case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: return selectG_AMDGPU_ATOMIC_CMPXCHG(I); @@ -2859,10 +2885,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectG_INSERT_VECTOR_ELT(I); case TargetOpcode::G_SHUFFLE_VECTOR: return selectG_SHUFFLE_VECTOR(I); - case AMDGPU::G_AMDGPU_ATOMIC_INC: - case AMDGPU::G_AMDGPU_ATOMIC_DEC: - initM0(I); - return selectImpl(I, *CoverageInfo); case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { const AMDGPU::ImageDimIntrinsicInfo *Intr diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 1fe80958917d6..1a0a9cefacf98 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -47,7 +47,7 @@ class SIInstrInfo; class SIMachineFunctionInfo; class SIRegisterInfo; -class AMDGPUInstructionSelector : public InstructionSelector { +class AMDGPUInstructionSelector final : public InstructionSelector { private: MachineRegisterInfo *MRI; @@ -108,6 +108,7 @@ class AMDGPUInstructionSelector : public InstructionSelector { bool selectDivScale(MachineInstr &MI) const; bool selectIntrinsicIcmp(MachineInstr &MI) const; bool selectBallot(MachineInstr &I) const; + bool selectRelocConstant(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I) const; bool selectEndCfIntrinsic(MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 80633bbc6179d..b40870024cc49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -70,6 +70,13 @@ static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { }; } +static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return Ty.getSizeInBits() % 32 == 0; + }; +} + static LegalityPredicate isWideVec16(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; @@ -115,20 +122,32 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +static LLT getBitcastRegisterType(const LLT Ty) { + const unsigned Size = Ty.getSizeInBits(); + + LLT CoercedTy; + if (Size <= 32) { + // <2 x s8> -> s16 + // <4 x s8> -> s32 + return LLT::scalar(Size); + } + + return 
LLT::scalarOrVector(Size / 32, 32); +} + static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; - unsigned Size = Ty.getSizeInBits(); - - LLT CoercedTy; - if (Size <= 32) { - // <2 x s8> -> s16 - // <4 x s8> -> s32 - CoercedTy = LLT::scalar(Size); - } else - CoercedTy = LLT::scalarOrVector(Size / 32, 32); + return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); + }; +} - return std::make_pair(TypeIdx, CoercedTy); +static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + unsigned Size = Ty.getSizeInBits(); + assert(Size % 32 == 0); + return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); }; } @@ -319,6 +338,20 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, !loadStoreBitcastWorkaround(Ty); } +/// Return true if a load or store of the type should be lowered with a bitcast +/// to a different type. +static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, + const unsigned MemSizeInBits) { + const unsigned Size = Ty.getSizeInBits(); + if (Size != MemSizeInBits) + return Size <= 32 && Ty.isVector(); + + if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) + return true; + return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + !isRegisterVectorElementType(Ty.getElementType()); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -415,14 +448,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(AllS64Vectors) .legalFor(AddrSpaces64) .legalFor(AddrSpaces32) + .legalIf(isPointer(0)) .clampScalar(0, S32, S256) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .legalIf(isPointer(0)); + .scalarize(0); - if (ST.hasVOP3PInsts()) { - assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp"); + if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { + // Full set of gfx9 features. getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .clampScalar(0, S16, S32) @@ -431,7 +465,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) - .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul + .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalar(0, S16) .clampMaxNumElements(0, S16, 2) .scalarize(0) @@ -444,12 +478,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 - assert(ST.hasIntClamp() && "all targets with 16-bit should support clamp"); - // Technically the saturating operations require clamp bit support, but this // was introduced at the same time as 16-bit operations. getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .lowerFor({S32, S16}) // FIXME: legal with clamp modifier + .legalFor({S32, S16}) // Clamp modifier .minScalar(0, S16) .scalarize(0) .widenScalarToNextPow2(0, 16) @@ -469,7 +501,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasIntClamp()) { getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .lowerFor({S32}) // FIXME: legal with clamp modifier. + .legalFor({S32}) // Clamp modifier. 
.scalarize(0) .minScalarOrElt(0, S32) .lower(); @@ -481,6 +513,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); } + // FIXME: DAG expansion gets better results. The widening uses the smaller + // range values and goes for the min/max lowering directly. getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) .minScalar(0, S32) .scalarize(0) @@ -524,9 +558,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) + .legalIf(isPointer(0)) .clampScalar(0, S32, S64) - .widenScalarToNextPow2(0) - .legalIf(isPointer(0)); + .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({S32, S64, S16}) @@ -550,8 +584,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({{PrivatePtr, S32}}); getActionDefinitionsBuilder(G_GLOBAL_VALUE) - .unsupportedFor({PrivatePtr}) - .custom(); + .customIf(typeIsNot(0, PrivatePtr)); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &FPOpActions = getActionDefinitionsBuilder( @@ -710,7 +744,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); - getActionDefinitionsBuilder(G_INTRINSIC_ROUND) + // Lower roundeven into G_FRINT + getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) .scalarize(0) .lower(); @@ -732,10 +767,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // FIXME: Clamp offset operand. getActionDefinitionsBuilder(G_PTR_ADD) - .legalIf(isPointer(0)) - .scalarize(0); + .legalIf(all(isPointer(0), sameSize(0, 1))) + .scalarize(0) + .scalarSameSizeAs(1, 0); getActionDefinitionsBuilder(G_PTRMASK) .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) @@ -1030,16 +1065,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // 16-bit vector parts. 
Actions.bitcastIf( [=](const LegalityQuery &Query) -> bool { - const LLT Ty = Query.Types[0]; - const unsigned Size = Ty.getSizeInBits(); - - if (Size != Query.MMODescrs[0].SizeInBits) - return Size <= 32 && Ty.isVector(); - - if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) - return true; - return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && - !isRegisterVectorElementType(Ty.getElementType()); + return shouldBitcastLoadStoreType(ST, Query.Types[0], + Query.MMODescrs[0].SizeInBits); }, bitcastToRegisterType(0)); Actions @@ -1196,14 +1223,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN}) .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, - {S64, GlobalPtr}, {S64, LocalPtr}}); + {S64, GlobalPtr}, {S64, LocalPtr}, + {S32, RegionPtr}, {S64, RegionPtr}}); if (ST.hasFlatAddressSpace()) { Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } if (ST.hasLDSFPAtomics()) { getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}}); + .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output @@ -1276,15 +1304,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT EltTy = Query.Types[EltTypeIdx]; const LLT VecTy = Query.Types[VecTypeIdx]; const LLT IdxTy = Query.Types[IdxTypeIdx]; - return (EltTy.getSizeInBits() == 16 || - EltTy.getSizeInBits() % 32 == 0) && - VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= MaxRegisterSize && - IdxTy.getSizeInBits() == 32; + const unsigned EltSize = EltTy.getSizeInBits(); + return (EltSize == 32 || EltSize == 64) && + VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= MaxRegisterSize && + IdxTy.getSizeInBits() == 32; + }) + .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)), + bitcastToVectorElement32(1)) + //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) + .bitcastIf( + all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)), + [=](const LegalityQuery &Query) { + // For > 64-bit element types, try to turn this into a 64-bit + // element vector since we may be able to do better indexing + // if this is scalar. If not, fall back to 32. + const LLT EltTy = Query.Types[EltTypeIdx]; + const LLT VecTy = Query.Types[VecTypeIdx]; + const unsigned DstEltSize = EltTy.getSizeInBits(); + const unsigned VecSize = VecTy.getSizeInBits(); + + const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; + return std::make_pair( + VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); }) .clampScalar(EltTypeIdx, S32, S64) .clampScalar(VecTypeIdx, S32, S64) - .clampScalar(IdxTypeIdx, S32, S32); + .clampScalar(IdxTypeIdx, S32, S32) + // TODO: Clamp the number of elements before resorting to stack lowering. + // It should only be necessary with variable indexes. 
+ // As a last resort, lower to the stack + .lower(); } getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) @@ -1489,11 +1539,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_READCYCLECOUNTER) .legalFor({S64}); + getActionDefinitionsBuilder(G_FENCE) + .alwaysLegal(); + getActionDefinitionsBuilder({ // TODO: Verify V_BFI_B32 is generated from expanded bit ops G_FCOPYSIGN, G_ATOMIC_CMPXCHG_WITH_SUCCESS, + G_ATOMICRMW_NAND, + G_ATOMICRMW_FSUB, G_READ_REGISTER, G_WRITE_REGISTER, @@ -1623,8 +1678,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) + if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); // Offset into amd_queue_t for group_segment_aperture_base_hi / @@ -1666,8 +1720,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const AMDGPUTargetMachine &TM = static_cast(MF.getTarget()); - const GCNSubtarget &ST = MF.getSubtarget(); - if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { + if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); return true; } @@ -2240,8 +2293,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( Register CmpVal = MI.getOperand(2).getReg(); Register NewVal = MI.getOperand(3).getReg(); - assert(SITargetLowering::isFlatGlobalAddrSpace( - MRI.getType(PtrReg).getAddressSpace()) && + assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered"); LLT ValTy = MRI.getType(CmpVal); @@ -2489,33 +2541,16 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, return insertLiveInCopy(B, MRI, LiveIn, PhyReg); } -const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( - MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - LLT ArgTy; - std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); - if (!Arg) { - LLVM_DEBUG(dbgs() << "Required arg register missing\n"); - return nullptr; - } - return Arg; -} - bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, - const ArgDescriptor *Arg) const { - if (!Arg->isRegister() || !Arg->getRegister().isValid()) - return false; // TODO: Handle these - - Register SrcReg = Arg->getRegister(); + const ArgDescriptor *Arg, + const TargetRegisterClass *ArgRC, + LLT ArgTy) const { + MCRegister SrcReg = Arg->getRegister(); assert(SrcReg.isPhysical() && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); MachineRegisterInfo &MRI = *B.getMRI(); - - LLT Ty = MRI.getType(DstReg); - Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); + Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? 
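The loadInputValue refactor in this hunk and the next splits the work in two: a thin overload resolves the (ArgDescriptor, register class, LLT) triple once, by unpacking the tuple from getPreloadedValue with std::tie, performs the bail-out check, and forwards to the overload that does the actual copy. A self-contained sketch of that delegation idiom, with stand-in types replacing the LLVM ones (everything here is illustrative, not the in-tree API):

#include <cassert>
#include <tuple>

// Stand-ins for llvm::ArgDescriptor, llvm::TargetRegisterClass and llvm::LLT.
struct ArgDescriptor { bool IsReg = true; };
struct RegClass {};
struct LLT {};

// Hypothetical analogue of SIMachineFunctionInfo::getPreloadedValue().
static std::tuple<const ArgDescriptor *, const RegClass *, LLT>
getPreloadedValue() {
  static ArgDescriptor Arg;
  static RegClass RC;
  return {&Arg, &RC, LLT{}};
}

// Worker overload: receives everything it needs and does no further lookups.
static bool loadInputValue(const ArgDescriptor *Arg, const RegClass *RC, LLT) {
  return Arg != nullptr && RC != nullptr;
}

// Thin overload: resolves the triple once and delegates, as in the patch.
static bool loadInputValue() {
  const ArgDescriptor *Arg;
  const RegClass *RC;
  LLT Ty;
  std::tie(Arg, RC, Ty) = getPreloadedValue();
  if (!Arg || !Arg->IsReg)
    return false; // bail-out, mirroring the "TODO: Handle these" case above
  return loadInputValue(Arg, RC, Ty);
}

int main() {
  assert(loadInputValue());
  return 0;
}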
@@ -2538,15 +2573,24 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, return true; } -bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, +bool AMDGPULegalizerInfo::loadInputValue( + Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); + const ArgDescriptor *Arg; + const TargetRegisterClass *ArgRC; + LLT ArgTy; + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); - const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); - if (!Arg) - return false; + if (!Arg->isRegister() || !Arg->getRegister().isValid()) + return false; // TODO: Handle these + return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); +} - if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) +bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) return false; MI.eraseFromParent(); @@ -3158,23 +3202,15 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - const SIMachineFunctionInfo *MFI = B.getMF().getInfo(); uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset( B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); LLT DstTy = MRI.getType(DstReg); LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - LLT ArgTy; - std::tie(Arg, RC, ArgTy) = - MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - if (!Arg) - return false; - Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); - if (!loadInputValue(KernargPtrReg, B, Arg)) + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; // FIXME: This should be nuw @@ -4110,8 +4146,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } bool AMDGPULegalizerInfo::legalizeSBufferLoad( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer) const { + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + GISelChangeObserver &Observer = Helper.Observer; + Register Dst = MI.getOperand(0).getReg(); LLT Ty = B.getMRI()->getType(Dst); unsigned Size = Ty.getSizeInBits(); @@ -4119,6 +4157,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( Observer.changingInstr(MI); + if (shouldBitcastLoadStoreType(ST, Ty, Size)) { + Ty = getBitcastRegisterType(Ty); + Helper.bitcastDst(MI, Ty, 0); + Dst = MI.getOperand(0).getReg(); + B.setInsertPt(B.getMBB(), MI); + } + // FIXME: We don't really need this intermediate instruction. The intrinsic // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. @@ -4140,8 +4185,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // always be legal. We may need to restore this to a 96-bit result if it turns // out this needs to be converted to a vector load during RegBankSelect. 
if (!isPowerOf2_32(Size)) { - LegalizerHelper Helper(MF, *this, Observer, B); - if (Ty.isVector()) Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); else @@ -4162,16 +4205,12 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, } else { // Pass queue pointer to trap handler as input, and insert trap instruction // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi - const ArgDescriptor *Arg = - getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); - if (!Arg) - return false; MachineRegisterInfo &MRI = *B.getMRI(); Register SGPR01(AMDGPU::SGPR0_SGPR1); Register LiveIn = getLiveInRegister( B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), /*InsertLiveInCopy=*/false); - if (!loadInputValue(LiveIn, B, Arg)) + if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return false; B.buildCopy(SGPR01, LiveIn); B.buildInstr(AMDGPU::S_TRAP) @@ -4337,7 +4376,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return true; } case Intrinsic::amdgcn_s_buffer_load: - return legalizeSBufferLoad(MI, B, Helper.Observer); + return legalizeSBufferLoad(Helper, MI); case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: return legalizeBufferStore(MI, MRI, B, false, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index d932cab4659ab..332d675c1a88e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -25,7 +25,7 @@ class LLVMContext; class GCNSubtarget; /// This class provides the information for the target register banks. -class AMDGPULegalizerInfo : public LegalizerInfo { +class AMDGPULegalizerInfo final : public LegalizerInfo { const GCNSubtarget &ST; public: @@ -91,11 +91,11 @@ class AMDGPULegalizerInfo : public LegalizerInfo { bool InsertLiveInCopy = true) const; Register insertLiveInCopy(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register LiveIn, Register PhyReg) const; - const ArgDescriptor * - getArgDescriptor(MachineIRBuilder &B, - AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, - const ArgDescriptor *Arg) const; + const ArgDescriptor *Arg, + const TargetRegisterClass *ArgRC, LLT ArgTy) const; + bool loadInputValue(Register DstReg, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; bool legalizePreloadedArgIntrin( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; @@ -167,9 +167,7 @@ class AMDGPULegalizerInfo : public LegalizerInfo { GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const; - bool legalizeSBufferLoad( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer) const; + bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, bool IsInc) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 4a14259f1bdb1..538a22df514fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -495,8 +495,7 @@ bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const { } bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { - return AllNative || - std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end(); + return AllNative || llvm::is_contained(UseNative, F); } 
void AMDGPULibCalls::initNativeFuncs() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 098b0e9938861..8a48ea5bd30c5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -234,7 +234,7 @@ namespace { #include "AMDGPUGenPostLegalizeGICombiner.inc" #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo { +class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 800ad2039f0e9..aea148e910710 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -37,7 +37,7 @@ namespace { #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo { +class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index de05c2119d985..4cfe0edfc5330 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -605,7 +605,7 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, if (isa(OtherOp)) return true; - Value *OtherObj = GetUnderlyingObject(OtherOp, *DL); + Value *OtherObj = getUnderlyingObject(OtherOp); if (!isa(OtherObj)) return false; @@ -749,41 +749,79 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { if (LocalMemLimit == 0) return false; - const DataLayout &DL = Mod->getDataLayout(); + SmallVector Stack; + SmallPtrSet VisitedConstants; + SmallPtrSet UsedLDS; + + auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { + for (const User *U : Val->users()) { + if (const Instruction *Use = dyn_cast(U)) { + if (Use->getParent()->getParent() == &F) + return true; + } else { + const Constant *C = cast(U); + if (VisitedConstants.insert(C).second) + Stack.push_back(C); + } + } + + return false; + }; - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; for (GlobalVariable &GV : Mod->globals()) { if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast(U); - if (!Use) { - // FIXME: This is probably a constant expression use. We should - // recursively search the users of it for the parent function instead of - // bailing. - LLVM_DEBUG(dbgs() << "Giving up on LDS size estimate " - "due to constant expression\n"); - return false; - } + if (visitUsers(&GV, &GV)) { + UsedLDS.insert(&GV); + Stack.clear(); + continue; + } - if (Use->getParent()->getParent() == &F) { - Align Alignment = - DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); - - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. 
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment); - CurrentLocalMemUsage += AllocSize; + // For any ConstantExpr uses, we need to recursively search the users until + // we see a function. + while (!Stack.empty()) { + const Constant *C = Stack.pop_back_val(); + if (visitUsers(&GV, C)) { + UsedLDS.insert(&GV); + Stack.clear(); break; } } } + const DataLayout &DL = Mod->getDataLayout(); + SmallVector, 16> AllocatedSizes; + AllocatedSizes.reserve(UsedLDS.size()); + + for (const GlobalVariable *GV : UsedLDS) { + Align Alignment = + DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); + uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); + AllocatedSizes.emplace_back(AllocSize, Alignment); + } + + // Sort to try to estimate the worst case alignment padding + // + // FIXME: We should really do something to fix the addresses to a more optimal + // value instead + llvm::sort(AllocatedSizes.begin(), AllocatedSizes.end(), + [](std::pair LHS, std::pair RHS) { + return LHS.second < RHS.second; + }); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + + // FIXME: Try to account for padding here. The real padding and address is + // currently determined from the inverse order of uses in the function when + // legalizing, which could also potentially change. We try to estimate the + // worst case here, but we probably should fix the addresses earlier. + for (auto Alloc : AllocatedSizes) { + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second); + CurrentLocalMemUsage += Alloc.first; + } + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 71d82679b3ff1..a99dbeebfbf8c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -39,7 +39,7 @@ namespace { #include "AMDGPUGenRegBankGICombiner.inc" #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H -class AMDGPURegBankCombinerInfo : public CombinerInfo { +class AMDGPURegBankCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 93006bb68ed37..9674474cd3cf9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3232,7 +3232,7 @@ AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, LLT PtrTy = MRI.getType(PtrReg); unsigned Size = PtrTy.getSizeInBits(); if (Subtarget.useFlatForGlobal() || - !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) + !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); // If we're using MUBUF instructions for global memory, an SGPR base register @@ -3258,8 +3258,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); - if (PtrBank == &AMDGPU::SGPRRegBank && - SITargetLowering::isFlatGlobalAddrSpace(AS)) { + if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { if (isScalarLoadLegal(MI)) { // We have a uniform instruction so we want to use an SMRD load ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -4023,7 +4022,8 
@@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_kernarg_segment_ptr: case Intrinsic::amdgcn_s_getpc: - case Intrinsic::amdgcn_groupstaticsize: { + case Intrinsic::amdgcn_groupstaticsize: + case Intrinsic::amdgcn_reloc_constant: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -4205,6 +4205,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_global_atomic_csub: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 23a466e9bd85b..2cfb32201a6a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -39,7 +39,7 @@ class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { #define GET_TARGET_REGBANK_CLASS #include "AMDGPUGenRegisterBank.inc" }; -class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { +class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo { public: const GCNSubtarget &Subtarget; const SIRegisterInfo *TRI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 9f6ebd00cd97b..6c70b53b23c11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_LO16, SReg_32, SReg_64, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024] + [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 6b23830491235..0f57d34ba6aaf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -791,6 +791,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo, return CIInsts; } + /// \returns true if the target has integer add/sub instructions that do not + /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, + /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier + /// for saturation. 
bool hasAddNoCarry() const { return AddNoCarryInsts; } @@ -1206,6 +1210,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo, return getWavefrontSize() == 32; } + bool isWave64() const { + return getWavefrontSize() == 64; + } + const TargetRegisterClass *getBoolRC() const { return getRegisterInfo()->getBoolRC(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b4b10835837cd..b49a417dfb09c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -526,6 +526,12 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( return I.get(); } +bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return AMDGPU::isFlatGlobalAddrSpace(SrcAS) && + AMDGPU::isFlatGlobalAddrSpace(DestAS); +} + TargetTransformInfo R600TargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(R600TTIImpl(this, F)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index e223fecc88195..aedcaf3fe4149 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -62,6 +62,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0; } + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 7998b0cb9f6ac..da00a993bd649 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -169,7 +169,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (AS == AMDGPUAS::PRIVATE_ADDRESS) { const Value *Ptr = GEP->getPointerOperand(); const AllocaInst *Alloca = - dyn_cast(GetUnderlyingObject(Ptr, DL)); + dyn_cast(getUnderlyingObject(Ptr)); if (!Alloca || !Alloca->isStaticAlloca()) continue; Type *Ty = Alloca->getAllocatedType(); @@ -583,13 +583,17 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, Opd1PropInfo, Opd2PropInfo, Args, CxtI); } -// Return true if there's a potential benefit from using v2f16 instructions for -// an intrinsic, even if it requires nontrivial legalization. +// Return true if there's a potential benefit from using v2f16/v2i16 +// instructions for an intrinsic, even if it requires nontrivial legalization. static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { switch (ID) { case Intrinsic::fma: // TODO: fmuladd // There's a small benefit to using vector ops in the legalized code. case Intrinsic::round: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: return true; default: return false; @@ -930,7 +934,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Type *MaskTy = MaskOp->getType(); bool DoTruncate = false; - if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) { + + const GCNTargetMachine &TM = + static_cast(getTLI()->getTargetMachine()); + if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) { // All valid 64-bit to 32-bit casts work by chopping off the high // bits. Any masking only clearing the low bits will also apply in the new // address space. 
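To make the truncation rule in the comment above concrete: rewriting a flat 64-bit pointer into a 32-bit address space chops off the high half, so a 64-bit ptrmask can be narrowed exactly when every bit it clears lies in the low 32 bits. A self-contained illustration of that check follows; maskIsSafeToTruncate is not an LLVM API, just a restatement of the rule.

#include "llvm/Support/MathExtras.h" // Hi_32
using namespace llvm;

// True when masking then truncating equals truncating then applying the low
// 32 mask bits: the high half of the mask must be all ones, i.e. the mask
// may only clear low bits.
static bool maskIsSafeToTruncate(uint64_t Mask) {
  return Hi_32(Mask) == 0xffffffffu;
  // e.g. 0xfffffffffffff000 (page align)  -> true, becomes 0xfffff000
  //      0x00000000ffffffff (clears high) -> false, must be rejected
}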
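Also for orientation: the AMDGPU::isFlatGlobalAddrSpace that TM.isNoopAddrSpaceCast is built on here is the former SITargetLowering static helper. Its new definition lies outside this excerpt, but the SIISelLowering.h hunk further down removes exactly this body, so presumably it survives unchanged:

// Presumed relocated form of the helper (its new home is not shown in this
// diff). Flat, global, and constant pointers share the same hardware
// addressing, so address-space casts among them are no-ops; address spaces
// the target does not know about are treated as flat-compatible as well.
inline bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}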
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index 281ae6d646e9e..f898456203a10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -96,7 +96,7 @@ namespace { SmallVector All; for (auto MD : NamedMD->operands()) for (const auto &Op : MD->operands()) - if (std::find(All.begin(), All.end(), Op.get()) == All.end()) + if (!llvm::is_contained(All, Op.get())) All.push_back(Op.get()); NamedMD->eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 9f3a6ffc35e6f..2833875e438cd 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,11 @@ class AMDGPUOperand : public MCParsedAsmOperand { return Imm.Val; } + void setImm(int64_t Val) { + assert(isImm()); + Imm.Val = Val; + } + ImmTy getImmTy() const { assert(isImm()); return Imm.Type; @@ -1297,8 +1302,13 @@ class AMDGPUAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); OperandMatchResultTy parseDfmtNfmt(int64_t &Format); OperandMatchResultTy parseUfmt(int64_t &Format); + OperandMatchResultTy parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); + OperandMatchResultTy parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); OperandMatchResultTy parseFORMAT(OperandVector &Operands); + OperandMatchResultTy parseSymbolicOrNumericFormat(int64_t &Format); + OperandMatchResultTy parseNumericFormat(int64_t &Format); bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val); + bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } @@ -1367,6 +1377,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool trySkipToken(const AsmToken::TokenKind Kind); bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseId(StringRef &Val, const StringRef ErrMsg); + void peekTokens(MutableArrayRef Tokens); AsmToken::TokenKind getTokenKind() const; bool parseExpr(int64_t &Imm); @@ -4926,8 +4938,8 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { if (Dfmt == DFMT_UNDEF && Nfmt == NFMT_UNDEF) return MatchOperand_NoMatch; - Dfmt = (Dfmt == DFMT_UNDEF)? DFMT_DEFAULT : Dfmt; - Nfmt = (Nfmt == NFMT_UNDEF)? NFMT_DEFAULT : Nfmt; + Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; + Nfmt = (Nfmt == NFMT_UNDEF) ? 
NFMT_DEFAULT : Nfmt; Format = encodeDfmtNfmt(Dfmt, Nfmt); return MatchOperand_Success; @@ -4949,20 +4961,177 @@ AMDGPUAsmParser::parseUfmt(int64_t &Format) { return MatchOperand_Success; } +bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt, + int64_t &Nfmt, + StringRef FormatStr, + SMLoc Loc) { + using namespace llvm::AMDGPU::MTBUFFormat; + int64_t Format; + + Format = getDfmt(FormatStr); + if (Format != DFMT_UNDEF) { + Dfmt = Format; + return true; + } + + Format = getNfmt(FormatStr, getSTI()); + if (Format != NFMT_UNDEF) { + Nfmt = Format; + return true; + } + + Error(Loc, "unsupported format"); + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, + SMLoc FormatLoc, + int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + int64_t Dfmt = DFMT_UNDEF; + int64_t Nfmt = NFMT_UNDEF; + if (!matchDfmtNfmt(Dfmt, Nfmt, FormatStr, FormatLoc)) + return MatchOperand_ParseFail; + + if (trySkipToken(AsmToken::Comma)) { + StringRef Str; + SMLoc Loc = getLoc(); + if (!parseId(Str, "expected a format string") || + !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) { + return MatchOperand_ParseFail; + } + if (Dfmt == DFMT_UNDEF) { + Error(Loc, "duplicate numeric format"); + } else if (Nfmt == NFMT_UNDEF){ + Error(Loc, "duplicate data format"); + } + } + + Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; + Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt; + + if (isGFX10()) { + auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt); + if (Ufmt == UFMT_UNDEF) + Error(FormatLoc, "unsupported format"); + Format = Ufmt; + } else { + Format = encodeDfmtNfmt(Dfmt, Nfmt); + } + + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, + SMLoc Loc, + int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + auto Id = getUnifiedFormat(FormatStr); + if (Id == UFMT_UNDEF) + return MatchOperand_NoMatch; + + if (!isGFX10()) { + Error(Loc, "unified format is not supported on this GPU"); + return MatchOperand_ParseFail; + } + + Format = Id; + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseNumericFormat(int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + SMLoc Loc = getLoc(); + + if (!parseExpr(Format)) + return MatchOperand_ParseFail; + if (!isValidFormatEncoding(Format, getSTI())) { + Error(Loc, "out of range format"); + return MatchOperand_ParseFail; + } + + return MatchOperand_Success; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) { + using namespace llvm::AMDGPU::MTBUFFormat; + + if (!trySkipId("format", AsmToken::Colon)) + return MatchOperand_NoMatch; + + if (trySkipToken(AsmToken::LBrac)) { + StringRef FormatStr; + SMLoc Loc = getLoc(); + if (!parseId(FormatStr, "expected a format string")) + return MatchOperand_ParseFail; + + auto Res = parseSymbolicUnifiedFormat(FormatStr, Loc, Format); + if (Res == MatchOperand_NoMatch) + Res = parseSymbolicSplitFormat(FormatStr, Loc, Format); + if (Res != MatchOperand_Success) + return Res; + + skipToken(AsmToken::RBrac, "expected a closing square bracket"); + return MatchOperand_Success; + } + + return parseNumericFormat(Format); +} + OperandMatchResultTy AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) { using namespace llvm::AMDGPU::MTBUFFormat; - int64_t Format = isGFX10() ? UFMT_DEFAULT : DFMT_NFMT_DEFAULT; + int64_t Format = getDefaultFormatEncoding(getSTI()); OperandMatchResultTy Res; SMLoc Loc = getLoc(); + // Parse legacy format syntax. Res = isGFX10() ? 
parseUfmt(Format) : parseDfmtNfmt(Format); if (Res == MatchOperand_ParseFail) return Res; + bool FormatFound = (Res == MatchOperand_Success); + Operands.push_back( AMDGPUOperand::CreateImm(this, Format, Loc, AMDGPUOperand::ImmTyFORMAT)); + + if (FormatFound) + trySkipToken(AsmToken::Comma); + + if (isToken(AsmToken::EndOfStatement)) { + // We are expecting an soffset operand, + // but let matcher handle the error. + return MatchOperand_Success; + } + + // Parse soffset. + Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) + return Res; + + trySkipToken(AsmToken::Comma); + + if (!FormatFound) { + if (parseSymbolicOrNumericFormat(Format) == MatchOperand_Success) { + auto Size = Operands.size(); + AMDGPUOperand &Op = static_cast(*Operands[Size - 2]); + assert(Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyFORMAT); + Op.setImm(Format); + } + return MatchOperand_Success; + } + + if (isId("format") && peekToken().is(AsmToken::Colon)) { + Error(getLoc(), "duplicate format"); + return MatchOperand_ParseFail; + } return MatchOperand_Success; } @@ -5616,6 +5785,18 @@ AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { } } +bool +AMDGPUAsmParser::parseId(StringRef &Val, const StringRef ErrMsg) { + if (isToken(AsmToken::Identifier)) { + Val = getTokenStr(); + lex(); + return true; + } else { + Error(getLoc(), ErrMsg); + return false; + } +} + AsmToken AMDGPUAsmParser::getToken() const { return Parser.getTok(); @@ -5623,7 +5804,7 @@ AMDGPUAsmParser::getToken() const { AsmToken AMDGPUAsmParser::peekToken() { - return getLexer().peekTok(); + return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok(); } void diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 370e9db9e83e9..bb04ae06bc3b4 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -114,6 +114,7 @@ class MTBUF_Real : let isCodeGenOnly = 0; // copy relevant pseudo op flags + let UseNamedOperandTable = ps.UseNamedOperandTable; let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; let Constraints = ps.Constraints; @@ -1006,7 +1007,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let SubtargetPredicate = HasGFX10_BEncoding in defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32 + "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub >; let SubtargetPredicate = isGFX8GFX9 in { diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 69facada2e964..f99354b77272f 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -662,7 +662,7 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", let SubtargetPredicate = HasGFX10_BEncoding in defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", - VGPR_32, i32, atomic_csub_global_32>; + VGPR_32, i32, int_amdgcn_global_atomic_csub>; } // End is_flat_global = 1 @@ -959,7 +959,7 @@ def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 8b446a4342986..30a563fdc1dfd 100644 --- 
a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -715,8 +715,9 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { return -1; } -int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, - const MachineRegisterInfo &MRI) { +int +GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, + const MachineRegisterInfo &MRI) { // Helper to check for the hazard where VMEM instructions that store more than // 8 bytes can have there store data over written by the next instruction. const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1383,3 +1384,27 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { return WaitStatesNeeded; } + +bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { + if (!SU->isInstr()) + return false; + + MachineInstr *MAI = nullptr; + auto IsMFMAFn = [&MAI] (MachineInstr *MI) { + MAI = nullptr; + if (SIInstrInfo::isMAI(*MI) && + MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 && + MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32) + MAI = MI; + return MAI != nullptr; + }; + + MachineInstr *MI = SU->getInstr(); + if (IsMFMAFn(MI)) { + int W = getWaitStatesSince(IsMFMAFn, 16); + if (MAI) + return W < (int)TSchedModel.computeInstrLatency(MAI); + } + + return false; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 59d6d4f90755a..629fae92e0058 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -108,6 +108,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { unsigned PreEmitNoopsCommon(MachineInstr *); void AdvanceCycle() override; void RecedeCycle() override; + bool ShouldPreferAnother(SUnit *SU) override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp index 98d971630ca4f..79b33e24c8302 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -83,9 +83,10 @@ class GCNRegBankReassign : public MachineFunctionPass { class Candidate { public: - Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, - unsigned weight) - : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + Candidate(MachineInstr *mi, unsigned reg, unsigned subreg, + unsigned freebanks, unsigned weight) + : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks), + Weight(weight) {} bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } @@ -100,6 +101,7 @@ class GCNRegBankReassign : public MachineFunctionPass { MachineInstr *MI; unsigned Reg; + unsigned SubReg; unsigned FreeBanks; unsigned Weight; }; @@ -162,7 +164,7 @@ class GCNRegBankReassign : public MachineFunctionPass { const MCPhysReg *CSRegs; // Returns bank for a phys reg. - unsigned getPhysRegBank(unsigned Reg) const; + unsigned getPhysRegBank(unsigned Reg, unsigned SubReg) const; // Return a bit set for each register bank used. 4 banks for VGPRs and // 8 banks for SGPRs. @@ -176,7 +178,7 @@ class GCNRegBankReassign : public MachineFunctionPass { // a register chosen from Bank. std::pair analyzeInst(const MachineInstr &MI, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1); + unsigned SubReg = 0, int Bank = -1); // Return true if register is regular VGPR or SGPR or their tuples. // Returns false for special registers like m0, vcc etc. 
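Background that makes the SubReg threading in this pass easier to follow, under a simplified model of the banking (assumed here; the in-tree pass derives the numbering from register encodings): VGPRs spread across 4 banks by register index, SGPRs across 8 banks in pairs, with SGPR banks numbered after the VGPR ones. The odd half of a 64-bit tuple therefore lives in a different bank than the tuple's base register, which is why getPhysRegBank and analyzeInst now take a sub-register index.

// Simplified bank model; bankOf and the constants are illustrative.
enum { NUM_VGPR_BANKS = 4, NUM_SGPR_BANKS = 8, SGPR_BANK_OFFSET = 4 };

static unsigned bankOf(bool IsVGPR, unsigned RegIdx) {
  if (IsVGPR)
    return RegIdx % NUM_VGPR_BANKS;    // v0..v3 -> banks 0..3, v4 -> 0 again
  return SGPR_BANK_OFFSET + (RegIdx / 2) % NUM_SGPR_BANKS; // s0,s1 share one
}
// e.g. the sub1 half of v[4:5] is in bankOf(true, 5) == 1, not in
// bankOf(true, 4) == 0, hence the SubReg-aware queries above.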
@@ -216,11 +218,12 @@ class GCNRegBankReassign : public MachineFunctionPass { // candidates are collected and added to work list. unsigned computeStallCycles(unsigned SrcReg, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1, bool Collect = false); + unsigned SubReg = 0, int Bank = -1, + bool Collect = false); // Search for a register in Bank unused within LI. // Returns phys reg or NoRegister. - unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + unsigned scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const; // Try to reassign candidate. Returns number or stall cycles saved. unsigned tryReassign(Candidate &C); @@ -277,15 +280,24 @@ char GCNRegBankReassign::ID = 0; char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; -unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg, + unsigned SubReg) const { assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); if (Size == 16) Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + else if (Size > 32) { + if (SubReg) { + const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); + Reg = TRI->getSubReg(Reg, SubReg); + if (TRI->getRegSizeInBits(*SubRC) > 32) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } else { + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } + } if (TRI->hasVGPRs(RC)) { Reg -= AMDGPU::VGPR0; @@ -360,7 +372,7 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, std::pair GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, - int Bank) { + unsigned SubReg, int Bank) { unsigned StallCycles = 0; unsigned UsedBanks = 0; @@ -375,26 +387,39 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, if (!Op.isReg() || Op.isUndef()) continue; - Register R = Op.getReg(); - if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) - continue; + const Register R = Op.getReg(); + const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R); - unsigned ShiftedBank = Bank; + // Do not compute stalls for AGPRs + if (TRI->hasAGPRs(RC)) + continue; - if (Bank != -1 && R == Reg && Op.getSubReg()) { - unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg()); + // Do not compute stalls if sub-register covers all banks + if (Op.getSubReg()) { LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (Offset && Bank < NUM_VGPR_BANKS) { - // If a register spans all banks we cannot shift it to avoid conflict. + if (TRI->hasVGPRs(RC)) { if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) continue; - ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS; - } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) { - // If a register spans all banks we cannot shift it to avoid conflict. + } else { if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) continue; + } + } + + unsigned ShiftedBank = Bank; + + if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) { + unsigned RegOffset = + TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); + unsigned Offset = TRI->getChannelFromSubReg( + Op.getSubReg() ? 
Op.getSubReg() : (unsigned)AMDGPU::sub0); + if (Bank < NUM_VGPR_BANKS) { + unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); + ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; + } else if (Bank >= SGPR_BANK_OFFSET) { + unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS; + (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; } } @@ -576,17 +601,17 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI, unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); if (FreeBanks1) - Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight - + ((Size2 > Size1) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1, + Weight + ((Size2 > Size1) ? 1 : 0))); if (FreeBanks2) - Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight - + ((Size1 > Size2) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2, + Weight + ((Size1 > Size2) ? 1 : 0))); } } } -unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, - unsigned Reg, int Bank, +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg, + unsigned SubReg, int Bank, bool Collect) { unsigned TotalStallCycles = 0; SmallSet Visited; @@ -598,7 +623,7 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, continue; unsigned StallCycles; unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank); + std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); TotalStallCycles += StallCycles; if (Collect) collectCandidates(MI, UsedBanks, StallCycles); @@ -607,8 +632,8 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, return TotalStallCycles; } -unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, - unsigned Bank) const { +unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, + unsigned SubReg) const { const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? 
MaxNumVGPRs : MaxNumSGPRs; @@ -620,7 +645,7 @@ unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, if (TRI->isSubRegisterEq(Reg, MaxReg)) break; - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) continue; for (unsigned I = 0; CSRegs[I]; ++I) @@ -669,7 +694,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { if (C.FreeBanks & (1 << Bank)) { LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); if (Stalls < OrigStalls) { LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " << Stalls << '\n'); @@ -683,7 +708,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); - unsigned Reg = scavengeReg(LI, BS.Bank); + unsigned Reg = scavengeReg(LI, BS.Bank, C.SubReg); if (Reg == AMDGPU::NoRegister) { LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) << '\n'); @@ -801,7 +826,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { Candidates.pop_back(); if (LocalCyclesSaved) { removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); Candidates.sort(); LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e44c0194e811b..7a54f49c18b08 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -299,23 +299,48 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { +} + +void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { using namespace llvm::AMDGPU::MTBUFFormat; + int OpNo = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::format); + assert(OpNo != -1); + unsigned Val = MI->getOperand(OpNo).getImm(); if (AMDGPU::isGFX10(STI)) { if (Val == UFMT_DEFAULT) return; - O << " format:" << Val; + if (isValidUnifiedFormat(Val)) { + O << " format:[" << getUnifiedFormatName(Val) << ']'; + } else { + O << " format:" << Val; + } } else { if (Val == DFMT_NFMT_DEFAULT) return; - unsigned Dfmt; - unsigned Nfmt; - decodeDfmtNfmt(Val, Dfmt, Nfmt); - O << " dfmt:" << Dfmt; - O << ", nfmt:" << Nfmt; + if (isValidDfmtNfmt(Val, STI)) { + unsigned Dfmt; + unsigned Nfmt; + decodeDfmtNfmt(Val, Dfmt, Nfmt); + O << " format:["; + if (Dfmt != DFMT_DEFAULT) { + O << getDfmtName(Dfmt); + if (Nfmt != NFMT_DEFAULT) { + O << ','; + } + } + if (Nfmt != NFMT_DEFAULT) { + O << getNfmtName(Nfmt, STI); + } + O << ']'; + } else { + O << " format:" << Val; + } } - O << ','; } void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, @@ -682,6 +707,14 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, printDefaultVccOperand(OpNo, STI, O); break; } + + if (Desc.TSFlags & SIInstrFlags::MTBUF) { + int SOffsetIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::soffset); + assert(SOffsetIdx != -1); + if ((int)OpNo == SOffsetIdx) + printSymbolicFormat(MI, STI, O); + } } void 
AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 78a66a78ec341..ed45c5309ea25 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -99,6 +99,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSymbolicFormat(const MCInst *MI, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index 2cc21364c4397..055e2de59ea1d 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -353,8 +353,8 @@ class LoadVtxId1 : PatFrag < const MemSDNode *LD = cast(N); return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - !isa(GetUnderlyingObject( - LD->getMemOperand()->getValue(), CurDAG->getDataLayout()))); + !isa(getUnderlyingObject( + LD->getMemOperand()->getValue()))); }]>; def vtx_id1_az_extloadi8 : LoadVtxId1 ; @@ -365,8 +365,8 @@ class LoadVtxId2 : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast(N); return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa(GetUnderlyingObject( - LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); + isa(getUnderlyingObject( + LD->getMemOperand()->getValue())); }]>; def vtx_id2_az_extloadi8 : LoadVtxId2 ; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 9c9dd66a4a79c..58d77f3b224b5 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -394,27 +394,56 @@ enum ModeRegisterMasks : uint32_t { namespace MTBUFFormat { -enum DataFormat { - DFMT_MAX = 15, +enum DataFormat : int64_t { + DFMT_INVALID = 0, + DFMT_8, + DFMT_16, + DFMT_8_8, + DFMT_32, + DFMT_16_16, + DFMT_10_11_11, + DFMT_11_11_10, + DFMT_10_10_10_2, + DFMT_2_10_10_10, + DFMT_8_8_8_8, + DFMT_32_32, + DFMT_16_16_16_16, + DFMT_32_32_32, + DFMT_32_32_32_32, + DFMT_RESERVED_15, + + DFMT_MIN = DFMT_INVALID, + DFMT_MAX = DFMT_RESERVED_15, DFMT_UNDEF = -1, - DFMT_DEFAULT = 1, + DFMT_DEFAULT = DFMT_8, DFMT_SHIFT = 0, - DFMT_MASK = DFMT_MAX + DFMT_MASK = 0xF }; -enum NumFormat { - NFMT_MAX = 7, +enum NumFormat : int64_t { + NFMT_UNORM = 0, + NFMT_SNORM, + NFMT_USCALED, + NFMT_SSCALED, + NFMT_UINT, + NFMT_SINT, + NFMT_RESERVED_6, // VI and GFX9 + NFMT_SNORM_OGL = NFMT_RESERVED_6, // SI and CI only + NFMT_FLOAT, + + NFMT_MIN = NFMT_UNORM, + NFMT_MAX = NFMT_FLOAT, NFMT_UNDEF = -1, - NFMT_DEFAULT = 0, + NFMT_DEFAULT = NFMT_UNORM, NFMT_SHIFT = 4, - NFMT_MASK = NFMT_MAX + NFMT_MASK = 7 }; -enum MergedFormat { +enum MergedFormat : int64_t { DFMT_NFMT_UNDEF = -1, DFMT_NFMT_DEFAULT = ((DFMT_DEFAULT & DFMT_MASK) << DFMT_SHIFT) | ((NFMT_DEFAULT & NFMT_MASK) << NFMT_SHIFT), @@ -425,11 +454,106 @@ enum MergedFormat { DFMT_NFMT_MAX = DFMT_NFMT_MASK }; -enum UnifiedFormat { +enum UnifiedFormat : int64_t { + UFMT_INVALID = 0, + + UFMT_8_UNORM, + UFMT_8_SNORM, + UFMT_8_USCALED, + UFMT_8_SSCALED, + UFMT_8_UINT, + UFMT_8_SINT, + + UFMT_16_UNORM, + UFMT_16_SNORM, + UFMT_16_USCALED, + UFMT_16_SSCALED, + UFMT_16_UINT, 
+ UFMT_16_SINT, + UFMT_16_FLOAT, + + UFMT_8_8_UNORM, + UFMT_8_8_SNORM, + UFMT_8_8_USCALED, + UFMT_8_8_SSCALED, + UFMT_8_8_UINT, + UFMT_8_8_SINT, + + UFMT_32_UINT, + UFMT_32_SINT, + UFMT_32_FLOAT, + + UFMT_16_16_UNORM, + UFMT_16_16_SNORM, + UFMT_16_16_USCALED, + UFMT_16_16_SSCALED, + UFMT_16_16_UINT, + UFMT_16_16_SINT, + UFMT_16_16_FLOAT, + + UFMT_10_11_11_UNORM, + UFMT_10_11_11_SNORM, + UFMT_10_11_11_USCALED, + UFMT_10_11_11_SSCALED, + UFMT_10_11_11_UINT, + UFMT_10_11_11_SINT, + UFMT_10_11_11_FLOAT, + + UFMT_11_11_10_UNORM, + UFMT_11_11_10_SNORM, + UFMT_11_11_10_USCALED, + UFMT_11_11_10_SSCALED, + UFMT_11_11_10_UINT, + UFMT_11_11_10_SINT, + UFMT_11_11_10_FLOAT, + + UFMT_10_10_10_2_UNORM, + UFMT_10_10_10_2_SNORM, + UFMT_10_10_10_2_USCALED, + UFMT_10_10_10_2_SSCALED, + UFMT_10_10_10_2_UINT, + UFMT_10_10_10_2_SINT, + + UFMT_2_10_10_10_UNORM, + UFMT_2_10_10_10_SNORM, + UFMT_2_10_10_10_USCALED, + UFMT_2_10_10_10_SSCALED, + UFMT_2_10_10_10_UINT, + UFMT_2_10_10_10_SINT, + + UFMT_8_8_8_8_UNORM, + UFMT_8_8_8_8_SNORM, + UFMT_8_8_8_8_USCALED, + UFMT_8_8_8_8_SSCALED, + UFMT_8_8_8_8_UINT, + UFMT_8_8_8_8_SINT, + + UFMT_32_32_UINT, + UFMT_32_32_SINT, + UFMT_32_32_FLOAT, + + UFMT_16_16_16_16_UNORM, + UFMT_16_16_16_16_SNORM, + UFMT_16_16_16_16_USCALED, + UFMT_16_16_16_16_SSCALED, + UFMT_16_16_16_16_UINT, + UFMT_16_16_16_16_SINT, + UFMT_16_16_16_16_FLOAT, + + UFMT_32_32_32_UINT, + UFMT_32_32_32_SINT, + UFMT_32_32_32_FLOAT, + UFMT_32_32_32_32_UINT, + UFMT_32_32_32_32_SINT, + UFMT_32_32_32_32_FLOAT, + + UFMT_FIRST = UFMT_INVALID, + UFMT_LAST = UFMT_32_32_32_32_FLOAT, + + UFMT_MAX = 127, UFMT_UNDEF = -1, - UFMT_DEFAULT = 1 + UFMT_DEFAULT = UFMT_8_UNORM }; } // namespace MTBUFFormat diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ea1d20f7387df..67b91e14fcca8 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -463,7 +463,18 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, static bool isUseSafeToFold(const SIInstrInfo *TII, const MachineInstr &MI, const MachineOperand &UseMO) { - return !UseMO.isUndef() && !TII->isSDWA(MI); + if (UseMO.isUndef() || TII->isSDWA(MI)) + return false; + + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + // Do not fold into an indirect mov. + return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0); + } + + return true; //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } @@ -643,38 +654,35 @@ void SIFoldOperands::foldOperand( if (FoldingImmLike && UseMI->isCopy()) { Register DestReg = UseMI->getOperand(0).getReg(); + Register SrcReg = UseMI->getOperand(1).getReg(); + assert(SrcReg.isVirtual()); - // Don't fold into a copy to a physical register. Doing so would interfere - // with the register coalescer's logic which would avoid redundant - // initalizations. - if (DestReg.isPhysical()) + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + + // Don't fold into a copy to a physical register with the same class. Doing + // so would interfere with the register coalescer's logic which would avoid + // redundant initializations. 
+ if (DestReg.isPhysical() && SrcRC->contains(DestReg)) return; - const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg); + const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg); + if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { + MachineRegisterInfo::use_iterator NextUse; + SmallVector CopyUses; + for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg), + E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + // There's no point trying to fold into an implicit operand. + if (Use->isImplicit()) + continue; - Register SrcReg = UseMI->getOperand(1).getReg(); - if (SrcReg.isVirtual()) { // XXX - This can be an assert? - const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); - if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { - MachineRegisterInfo::use_iterator NextUse; - SmallVector CopyUses; - for (MachineRegisterInfo::use_iterator - Use = MRI->use_begin(DestReg), E = MRI->use_end(); - Use != E; Use = NextUse) { - NextUse = std::next(Use); - - // There's no point trying to fold into an implicit operand. - if (Use->isImplicit()) - continue; - - FoldCandidate FC = FoldCandidate(Use->getParent(), - Use.getOperandNo(), &UseMI->getOperand(1)); - CopyUses.push_back(FC); - } - for (auto & F : CopyUses) { - foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, - FoldList, CopiesToReplace); - } + FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(), + &UseMI->getOperand(1)); + CopyUses.push_back(FC); + } + for (auto &F : CopyUses) { + foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace); } } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index a2e802009d098..a5b04570655a4 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -274,6 +274,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( return; } + // For GFX9. BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO) .addReg(FlatScrInitLo) .addReg(ScratchWaveOffsetReg); @@ -284,7 +285,7 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( return; } - assert(ST.getGeneration() < AMDGPUSubtarget::GFX10); + assert(ST.getGeneration() < AMDGPUSubtarget::GFX9); // Copy the size in bytes. 
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index fa2a96d33952b..59f45f9daf428 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -487,6 +487,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); + // Clamp modifier on add/sub + if (Subtarget->hasIntClamp()) { + setOperationAction(ISD::UADDSAT, MVT::i32, Legal); + setOperationAction(ISD::USUBSAT, MVT::i32, Legal); + } + + if (Subtarget->hasAddNoCarry()) { + setOperationAction(ISD::SADDSAT, MVT::i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); + } + setOperationAction(ISD::FMINNUM, MVT::f32, Custom); setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Custom); @@ -539,6 +552,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i16, Promote); setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); + setOperationAction(ISD::UADDSAT, MVT::i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::i16, Legal); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); @@ -703,6 +718,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::v2i16, Legal); setOperationAction(ISD::UMAX, MVT::v2i16, Legal); + setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); @@ -730,6 +750,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v4i16, Custom); setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom); + setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); setOperationAction(ISD::FMA, MVT::v4f16, Custom); @@ -1137,7 +1162,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile; return true; } @@ -1176,9 +1200,13 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -1450,11 +1478,6 @@ EVT SITargetLowering::getOptimalMemOpType( return MVT::Other; } -bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, - unsigned DestAS) const { - return 
isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); -} - bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); @@ -1469,7 +1492,9 @@ bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, if (SrcAS == AMDGPUAS::FLAT_ADDRESS) return true; - return isNoopAddrSpaceCast(SrcAS, DestAS); + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + return TM.isNoopAddrSpaceCast(SrcAS, DestAS); } bool SITargetLowering::isMemOpUniform(const SDNode *N) const { @@ -2257,8 +2282,10 @@ SDValue SITargetLowering::LowerFormalArguments( if (Arg.Flags.isByRef()) { SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); - if (!isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, - Arg.Flags.getPointerAddrSpace())) { + const GCNTargetMachine &TM = + static_cast(getTargetMachine()); + if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace())) { Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, Arg.Flags.getPointerAddrSpace()); } @@ -3630,13 +3657,13 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, // to avoid interfering with other uses, so probably requires a new // optimization pass. BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -3659,13 +3686,13 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, if (UseGPRIdxMode) { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -4470,6 +4497,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::UADDSAT: + case ISD::USUBSAT: + case ISD::SADDSAT: + case ISD::SSUBSAT: return splitBinaryVectorOp(Op, DAG); case ISD::SMULO: case ISD::UMULO: @@ -7193,19 +7224,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_csub: { - MemSDNode *M = cast(Op); - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op), - M->getVTList(), Ops, M->getMemoryVT(), - M->getMemOperand()); - } - default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) @@ -8487,7 +8505,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co unsigned AS = AtomicNode->getAddressSpace(); // No custom lowering required for local address space - if (!isFlatGlobalAddrSpace(AS)) + if (!AMDGPU::isFlatGlobalAddrSpace(AS)) 
return Op; // Non-local address space requires custom lowering for atomic compare diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f4c0764640575..19dea37ad410d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -275,15 +275,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { AS == AMDGPUAS::PRIVATE_ADDRESS; } - // FIXME: Missing constant_32bit - static bool isFlatGlobalAddrSpace(unsigned AS) { - return AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS || - AS > AMDGPUAS::MAX_AMDGPU_ADDRESS; - } - - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 2a157eb20ab47..5ab6edf3f606b 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -963,26 +963,28 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( int CallAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - RegInterval CallAddrOpInterval = + + if (MI.getOperand(CallAddrOpIdx).isReg()) { + RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx); - for (int RegNo = CallAddrOpInterval.first; - RegNo < CallAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( + for (int RegNo = CallAddrOpInterval.first; + RegNo < CallAddrOpInterval.second; ++RegNo) + ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); - int RtnAddrOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); - if (RtnAddrOpIdx != -1) { - RegInterval RtnAddrOpInterval = + int RtnAddrOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + if (RtnAddrOpIdx != -1) { + RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx); - for (int RegNo = RtnAddrOpInterval.first; - RegNo < RtnAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait( + for (int RegNo = RtnAddrOpInterval.first; + RegNo < RtnAddrOpInterval.second; ++RegNo) + ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); + } } - } else { // FIXME: Should not be relying on memoperands. 
// Look at the source operands of every instruction to see if diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5d7cd5ffc4ce7..5c096cae87c1d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -459,10 +459,8 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, auto Base2 = MO2->getValue(); if (!Base1 || !Base2) return false; - const MachineFunction &MF = *MI1.getParent()->getParent(); - const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); - Base1 = GetUnderlyingObject(Base1, DL); - Base2 = GetUnderlyingObject(Base2, DL); + Base1 = getUnderlyingObject(Base1); + Base2 = getUnderlyingObject(Base2); if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) return false; @@ -474,65 +472,29 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, unsigned NumBytes) const { + // If the mem ops (to be clustered) do not have the same base ptr, then they + // should not be clustered. assert(!BaseOps1.empty() && !BaseOps2.empty()); const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - const MachineOperand *FirstDst = nullptr; - const MachineOperand *SecondDst = nullptr; - - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || - (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) || - (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 7; - if (NumLoads > MaxGlobalLoadCluster) - return false; - - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); - if (!FirstDst) - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); - if (!SecondDst) - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (!FirstDst || !SecondDst) - return false; - - // Try to limit clustering based on the total number of bytes loaded - // rather than the number of instructions. This is done to help reduce - // register pressure. The method used is somewhat inexact, though, - // because it assumes that all loads in the cluster will load the - // same number of bytes as FirstLdSt. - - // The unit of this value is bytes. - // FIXME: This needs finer tuning. - unsigned LoadClusterThreshold = 16; - - const MachineRegisterInfo &MRI = - FirstLdSt.getParent()->getParent()->getRegInfo(); - - const Register Reg = FirstDst->getReg(); - - const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : RI.getPhysRegClass(Reg); - - // FIXME: NumLoads should not be subtracted 1. This is to match behavior - // of clusterNeighboringMemOps which was previosly passing cluster length - // less 1. LoadClusterThreshold should be tuned instead. - return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= - LoadClusterThreshold; + // In order to avoid register pressure, on average, the number of DWORDS + loaded together by all clustered mem ops should not exceed 8.
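+ // For example: four 12-byte loads give LoadSize = 12 and NumDWORDs = ((12 + 3) / 4) * 4 = 12, which exceeds 8, so they are not clustered, while four 4-byte loads give NumDWORDs = 4 and are.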
This is an + empirical value based on certain observations and performance-related + experiments. + // The good thing about this heuristic is that it avoids clustering of too many + // sub-word loads, and also avoids clustering of wide loads. Below is a + // brief summary of how the heuristic behaves for various `LoadSize`. + // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops + // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops + // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops + // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops + // (5) LoadSize >= 17: do not cluster + const unsigned LoadSize = NumBytes / NumLoads; + const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; + return NumDWORDs <= 8; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, @@ -578,7 +540,9 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, - RegScavenger &RS) { + RegScavenger &RS, + Register ImpDefSuperReg = Register(), + Register ImpUseSuperReg = Register()) { const SIRegisterInfo &RI = TII.getRegisterInfo(); assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || @@ -609,8 +573,17 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, DefOp.setIsKill(false); } - BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + MachineInstrBuilder Builder = + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) .add(DefOp); + if (ImpDefSuperReg) + Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); + + if (ImpUseSuperReg) { + Builder.addReg(ImpUseSuperReg, + getKillRegState(KillSrc) | RegState::Implicit); + } + + return; } @@ -640,9 +613,27 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, RS.setRegUsed(Tmp); } - TII.copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc); - BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + // Insert copy to temporary VGPR. + unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; + if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { + TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32; + } else { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + } + + MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) + .addReg(SrcReg, getKillRegState(KillSrc)); + if (ImpUseSuperReg) { + UseBuilder.addReg(ImpUseSuperReg, + getKillRegState(KillSrc) | RegState::Implicit); + } + + MachineInstrBuilder DefBuilder + = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) .addReg(Tmp, RegState::Kill); + + if (ImpDefSuperReg) + DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -863,11 +854,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } else if (RI.hasAGPRs(RC)) { Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? - AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { Opcode = AMDGPU::V_ACCVGPR_READ_B32; } + // For the cases where we need an intermediate instruction/temporary register + // (the result is an AGPR, and the source is either an SGPR or AGPR), we need + // a scavenger. + // + // FIXME: The pass should maintain this for us so we don't have to re-scan the + // whole block for every handled copy.
+ std::unique_ptr RS; + if (Opcode == AMDGPU::INSTRUCTION_LIST_END) + RS.reset(new RegScavenger()); + ArrayRef SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); @@ -878,22 +879,24 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; - if (Opcode == TargetOpcode::COPY) { - copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), - RI.getSubReg(SrcReg, SubIdx), KillSrc); - continue; - } - MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, SubIdx)); - - Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); + bool UseKill = KillSrc && Idx == SubIndices.size() - 1; - if (Idx == 0) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { + Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); + Register ImpUseSuper = SrcReg; + indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), + RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, + ImpDefSuper, ImpUseSuper); + } else { + MachineInstrBuilder Builder = + BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) + .addReg(RI.getSubReg(SrcReg, SubIdx)); + if (Idx == 0) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - bool UseKill = KillSrc && Idx == SubIndices.size() - 1; - Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + } } } @@ -2392,7 +2395,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, case VCCZ: { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); - assert(MRI.getRegClass(FalseReg) == RC); + if (MRI.getRegClass(FalseReg) != RC) + return false; int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? @@ -2406,7 +2410,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, // with a vector one. 
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); - assert(MRI.getRegClass(FalseReg) == RC); + if (MRI.getRegClass(FalseReg) != RC) + return false; int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 72feff80ac81c..d146114074a7e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -55,10 +55,6 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; @@ -311,10 +307,6 @@ class isPackedType { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -let AddressSpaces = !cast("LoadAddress_global").AddrSpaces in { -defm atomic_csub_global : binary_atomic_op; -} - foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { @@ -668,7 +660,6 @@ multiclass SIAtomicM0Glue2 ; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>; defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>; defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; @@ -1389,9 +1380,9 @@ def HWREG { } class getHwRegImm { - int ret = !or(Reg, - !or(!shl(Offset, 6), - !shl(!add(Size, -1), 11))); + int ret = !and(!or(Reg, + !or(!shl(Offset, 6), + !shl(!add(Size, -1), 11))), 65535); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 36d52ac3ee891..140e1f0e122a9 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -122,6 +122,19 @@ class SILowerControlFlow : public MachineFunctionPass { skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const; + /// Find the insertion point for a new conditional branch. + MachineBasicBlock::iterator + skipToUncondBrOrEnd(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + assert(I->isTerminator()); + + // FIXME: What if we had multiple pre-existing conditional branches? + MachineBasicBlock::iterator End = MBB.end(); + while (I != End && !I->isUnconditionalBranch()) + ++I; + return I; + } + // Remove redundant SI_END_CF instructions. void optimizeEndCf(); @@ -275,6 +288,10 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); + // Skip ahead to the unconditional branch in case there are other terminators + // present. + I = skipToUncondBrOrEnd(MBB, I); + // Insert the S_CBRANCH_EXECZ instruction which will be optimized later // during SIRemoveShortExecBranches. MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) @@ -353,6 +370,10 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { .addReg(Exec) .addReg(DstReg); + // Skip ahead to the unconditional branch in case there are other terminators + // present. 
+ ElsePt = skipToUncondBrOrEnd(MBB, ElsePt); + MachineInstr *Branch = BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) .addMBB(DestBB); @@ -435,8 +456,9 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { .addReg(Exec) .add(MI.getOperand(0)); + auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator()); MachineInstr *Branch = - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) .add(MI.getOperand(1)); if (LIS) { diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 1349d3b6bf3f6..a7e963e043f46 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -233,10 +233,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { // Find lowest available VGPR and use it as VGPR reserved for SGPR spills. static bool lowerShiftReservedVGPR(MachineFunction &MF, const GCNSubtarget &ST) { + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; + // Early out if pre-reservation of a VGPR for SGPR spilling is disabled. + if (!PreReservedVGPR) + return false; + + // If there are no free lower VGPRs available, default to using the + // pre-reserved register instead. + Register LowestAvailableVGPR = PreReservedVGPR; + MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - Register LowestAvailableVGPR, ReservedVGPR; ArrayRef AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF); for (MCPhysReg Reg : AllVGPR32s) { if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) { @@ -245,26 +253,29 @@ static bool lowerShiftReservedVGPR(MachineFunction &MF, } } - if (!LowestAvailableVGPR) - return false; - - ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); - int i = 0; + Optional FI; + // Check if we are reserving a CSR. Create a stack object for a possible spill + // in the function prologue. + if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) + FI = FrameInfo.CreateSpillStackObject(4, Align(4)); + + // Find saved info about the pre-reserved register. 
+ const auto *ReservedVGPRInfoItr = + std::find_if(FuncInfo->getSGPRSpillVGPRs().begin(), + FuncInfo->getSGPRSpillVGPRs().end(), + [PreReservedVGPR](const auto &SpillRegInfo) { + return SpillRegInfo.VGPR == PreReservedVGPR; + }); + + assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end()); + auto Index = + std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr); + + FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index); for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { - if (Reg.VGPR == ReservedVGPR) { - MBB.removeLiveIn(ReservedVGPR); - MBB.addLiveIn(LowestAvailableVGPR); - Optional FI; - if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) - FI = FrameInfo.CreateSpillStackObject(4, Align(4)); - - FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i); - } - ++i; - } + MBB.addLiveIn(LowestAvailableVGPR); MBB.sortUniqueLiveIns(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 788e9873f780f..441bad07f89f2 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -546,6 +546,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), + HasSpilledSGPRs(MFI.hasSpilledSGPRs()), + HasSpilledVGPRs(MFI.hasSpilledVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), @@ -567,6 +569,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath; MemoryBound = YamlMFI.MemoryBound; WaveLimiter = YamlMFI.WaveLimiter; + HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; + HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; return false; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 57a89d9acc815..d9a2d3abb0b88 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -281,6 +281,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; + bool HasSpilledSGPRs = false; + bool HasSpilledVGPRs = false; uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; @@ -308,6 +310,8 @@ template <> struct MappingTraits { YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); + YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); + YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4e6c72ca20e28..21419aab1a43f 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -47,6 +47,10 @@ using namespace llvm::AMDGPU; #define DEBUG_TYPE "si-memory-legalizer" #define PASS_NAME "SI Memory Legalizer" +static cl::opt AmdgcnSkipCacheInvalidations( + "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, 
+ cl::desc("Use this to skip inserting cache invalidating instructions.")); + namespace { LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -254,7 +258,7 @@ class SICacheControl { IsaVersion IV; - /// Whether to insert cache invalidation instructions. + /// Whether to insert cache invalidating instructions. bool InsertCacheInv; SICacheControl(const GCNSubtarget &ST); @@ -653,7 +657,7 @@ Optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( SICacheControl::SICacheControl(const GCNSubtarget &ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); - InsertCacheInv = !ST.isAmdPalOS(); + InsertCacheInv = !AmdgcnSkipCacheInvalidations; } /* static */ diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index a9717c6ffb705..449b6287a87b7 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -176,8 +176,7 @@ static unsigned getSaveExecOp(unsigned Opc) { } // These are only terminators to get correct spill code placement during -// register allocation, so turn them back into normal instructions. Only one of -// these is expected per block. +// register allocation, so turn them back into normal instructions. static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { switch (MI.getOpcode()) { case AMDGPU::S_MOV_B64_term: @@ -220,19 +219,29 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { } } +// Turn all pseudoterminators in the block into their equivalent non-terminator +// instructions. Returns the reverse iterator to the first non-terminator +// instruction in the block. static MachineBasicBlock::reverse_iterator fixTerminators( const SIInstrInfo &TII, MachineBasicBlock &MBB) { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + + bool Seen = false; + MachineBasicBlock::reverse_iterator FirstNonTerm = I; for (; I != E; ++I) { if (!I->isTerminator()) - return I; + return Seen ? FirstNonTerm : I; - if (removeTerminatorBit(TII, *I)) - return I; + if (removeTerminatorBit(TII, *I)) { + if (!Seen) { + FirstNonTerm = I; + Seen = true; + } + } } - return E; + return FirstNonTerm; } static MachineBasicBlock::reverse_iterator findExecCopy( @@ -291,8 +300,20 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (I == E) continue; - Register CopyToExec = isCopyToExec(*I, ST); - if (!CopyToExec.isValid()) + // It's possible to see other terminator copies after the exec copy. This + // can happen if control flow pseudos had their outputs used by phis. + Register CopyToExec; + + unsigned SearchCount = 0; + const unsigned SearchLimit = 5; + while (I != E && SearchCount++ < SearchLimit) { + CopyToExec = isCopyToExec(*I, ST); + if (CopyToExec) + break; + ++I; + } + + if (!CopyToExec) continue; // Scan backwards to find the def. diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 8af00fcf62a82..74546befbb596 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -77,6 +77,32 @@ static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) { return false; } +// See if there is a def between \p AndIdx and \p SelIdx that needs to live +// beyond \p AndIdx. 
+static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx, + SlotIndex SelIdx) { + LiveQueryResult AndLRQ = LR.Query(AndIdx); + return (!AndLRQ.isKill() && AndLRQ.valueIn() != LR.Query(SelIdx).valueOut()); +} + +// FIXME: Why do we bother trying to handle physical registers here? +static bool isDefBetween(const SIRegisterInfo &TRI, + LiveIntervals *LIS, Register Reg, + const MachineInstr &Sel, const MachineInstr &And) { + SlotIndex AndIdx = LIS->getInstructionIndex(And); + SlotIndex SelIdx = LIS->getInstructionIndex(Sel); + + if (Reg.isVirtual()) + return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx); + + for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) { + if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx)) + return true; + } + + return false; +} + // Optimize sequence // %sel = V_CNDMASK_B32_e64 0, 1, %cc // %cmp = V_CMP_NE_U32 1, %1 @@ -158,10 +184,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, Op1->getImm() != 0 || Op2->getImm() != 1) return AMDGPU::NoRegister; + Register CCReg = CC->getReg(); + + // If there was a def between the select and the and, we would need to move it + // to fold this. + if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And)) + return AMDGPU::NoRegister; + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' << *And); - Register CCReg = CC->getReg(); LIS->RemoveMachineInstrFromMaps(*And); MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 956658296a94a..9263be6e1abfd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1029,13 +1029,19 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, // Mark the "old value of vgpr" input undef only if this is the first sgpr // spill to this specific vgpr in the first basic block. - BuildMI(*MBB, MI, DL, + auto MIB = BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg, getKillRegState(IsKill)) .addImm(Spill.Lane) .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef); + if (i == 0 && NumSubRegs > 1) { + // We may be spilling a super-register which is only partially defined, + // and need to ensure later spills think the value is defined. + MIB.addReg(SuperReg, RegState::ImplicitDefine); + } + // FIXME: Since this spills to another register instead of an actual // frame index, we should delete the frame index when all references to // it are fixed. diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index d60fa58a0a74e..92cb2807611af 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -806,15 +806,19 @@ let hasSideEffects = 1 in { let mayLoad = 1 in { // s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow // its use in the readcyclecounter selection. +// FIXME: Need to truncate immediate to 16-bits. def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), - "$sdst, $simm16" ->; + "$sdst, $simm16", + [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> { + let SOPKZext = 1; +} } let mayLoad = 0, mayStore =0 in { +// FIXME: Need to truncate immediate to 16-bits. 
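+// (For reference: the hwreg immediate packs the register id in bits [5:0], the
+// offset in bits [10:6], and the width minus one in bits [15:11]; getHwRegImm
+// in SIInstrInfo.td now masks that encoding with 65535 so it always fits the
+// 16-bit simm16 field, while a raw timm operand reaching these pseudos is
+// still untruncated, hence the FIXME.)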
def S_SETREG_B32 : SOPK_Pseudo < "s_setreg_b32", (outs), (ins SReg_32:$sdst, hwreg:$simm16), @@ -1250,14 +1254,6 @@ let SubtargetPredicate = isGFX10Plus in { SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; } // End SubtargetPredicate = isGFX10Plus -//===----------------------------------------------------------------------===// -// S_GETREG_B32 Intrinsic Pattern. -//===----------------------------------------------------------------------===// -def : GCNPat < - (int_amdgcn_s_getreg timm:$simm16), - (S_GETREG_B32 (as_i16imm $simm16)) ->; - //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 5819a621f55d6..1d645bda4b3fa 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUAsmUtils.h" +#include "SIDefines.h" namespace llvm { namespace AMDGPU { @@ -87,6 +88,250 @@ const char* const IdSymbolic[] = { } // namespace Hwreg +namespace MTBUFFormat { + +StringLiteral const DfmtSymbolic[] = { + "BUF_DATA_FORMAT_INVALID", + "BUF_DATA_FORMAT_8", + "BUF_DATA_FORMAT_16", + "BUF_DATA_FORMAT_8_8", + "BUF_DATA_FORMAT_32", + "BUF_DATA_FORMAT_16_16", + "BUF_DATA_FORMAT_10_11_11", + "BUF_DATA_FORMAT_11_11_10", + "BUF_DATA_FORMAT_10_10_10_2", + "BUF_DATA_FORMAT_2_10_10_10", + "BUF_DATA_FORMAT_8_8_8_8", + "BUF_DATA_FORMAT_32_32", + "BUF_DATA_FORMAT_16_16_16_16", + "BUF_DATA_FORMAT_32_32_32", + "BUF_DATA_FORMAT_32_32_32_32", + "BUF_DATA_FORMAT_RESERVED_15" +}; + +StringLiteral const NfmtSymbolicGFX10[] = { + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const NfmtSymbolicSICI[] = { + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "BUF_NUM_FORMAT_SNORM_OGL", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9 + "BUF_NUM_FORMAT_UNORM", + "BUF_NUM_FORMAT_SNORM", + "BUF_NUM_FORMAT_USCALED", + "BUF_NUM_FORMAT_SSCALED", + "BUF_NUM_FORMAT_UINT", + "BUF_NUM_FORMAT_SINT", + "BUF_NUM_FORMAT_RESERVED_6", + "BUF_NUM_FORMAT_FLOAT" +}; + +StringLiteral const UfmtSymbolic[] = { + "BUF_FMT_INVALID", + + "BUF_FMT_8_UNORM", + "BUF_FMT_8_SNORM", + "BUF_FMT_8_USCALED", + "BUF_FMT_8_SSCALED", + "BUF_FMT_8_UINT", + "BUF_FMT_8_SINT", + + "BUF_FMT_16_UNORM", + "BUF_FMT_16_SNORM", + "BUF_FMT_16_USCALED", + "BUF_FMT_16_SSCALED", + "BUF_FMT_16_UINT", + "BUF_FMT_16_SINT", + "BUF_FMT_16_FLOAT", + + "BUF_FMT_8_8_UNORM", + "BUF_FMT_8_8_SNORM", + "BUF_FMT_8_8_USCALED", + "BUF_FMT_8_8_SSCALED", + "BUF_FMT_8_8_UINT", + "BUF_FMT_8_8_SINT", + + "BUF_FMT_32_UINT", + "BUF_FMT_32_SINT", + "BUF_FMT_32_FLOAT", + + "BUF_FMT_16_16_UNORM", + "BUF_FMT_16_16_SNORM", + "BUF_FMT_16_16_USCALED", + "BUF_FMT_16_16_SSCALED", + "BUF_FMT_16_16_UINT", + "BUF_FMT_16_16_SINT", + "BUF_FMT_16_16_FLOAT", + + "BUF_FMT_10_11_11_UNORM", + "BUF_FMT_10_11_11_SNORM", + "BUF_FMT_10_11_11_USCALED", + "BUF_FMT_10_11_11_SSCALED", + "BUF_FMT_10_11_11_UINT", + "BUF_FMT_10_11_11_SINT", + "BUF_FMT_10_11_11_FLOAT", + + "BUF_FMT_11_11_10_UNORM", + 
"BUF_FMT_11_11_10_SNORM", + "BUF_FMT_11_11_10_USCALED", + "BUF_FMT_11_11_10_SSCALED", + "BUF_FMT_11_11_10_UINT", + "BUF_FMT_11_11_10_SINT", + "BUF_FMT_11_11_10_FLOAT", + + "BUF_FMT_10_10_10_2_UNORM", + "BUF_FMT_10_10_10_2_SNORM", + "BUF_FMT_10_10_10_2_USCALED", + "BUF_FMT_10_10_10_2_SSCALED", + "BUF_FMT_10_10_10_2_UINT", + "BUF_FMT_10_10_10_2_SINT", + + "BUF_FMT_2_10_10_10_UNORM", + "BUF_FMT_2_10_10_10_SNORM", + "BUF_FMT_2_10_10_10_USCALED", + "BUF_FMT_2_10_10_10_SSCALED", + "BUF_FMT_2_10_10_10_UINT", + "BUF_FMT_2_10_10_10_SINT", + + "BUF_FMT_8_8_8_8_UNORM", + "BUF_FMT_8_8_8_8_SNORM", + "BUF_FMT_8_8_8_8_USCALED", + "BUF_FMT_8_8_8_8_SSCALED", + "BUF_FMT_8_8_8_8_UINT", + "BUF_FMT_8_8_8_8_SINT", + + "BUF_FMT_32_32_UINT", + "BUF_FMT_32_32_SINT", + "BUF_FMT_32_32_FLOAT", + + "BUF_FMT_16_16_16_16_UNORM", + "BUF_FMT_16_16_16_16_SNORM", + "BUF_FMT_16_16_16_16_USCALED", + "BUF_FMT_16_16_16_16_SSCALED", + "BUF_FMT_16_16_16_16_UINT", + "BUF_FMT_16_16_16_16_SINT", + "BUF_FMT_16_16_16_16_FLOAT", + + "BUF_FMT_32_32_32_UINT", + "BUF_FMT_32_32_32_SINT", + "BUF_FMT_32_32_32_FLOAT", + "BUF_FMT_32_32_32_32_UINT", + "BUF_FMT_32_32_32_32_SINT", + "BUF_FMT_32_32_32_32_FLOAT" +}; + +unsigned const DfmtNfmt2UFmt[] = { + DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), + + DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_11_11 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_SINT << NFMT_SHIFT), + DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_11_11_10 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_SINT << NFMT_SHIFT), + DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_2_10_10_10 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT), + 
DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) +}; + +} // namespace MTBUFFormat + namespace Swizzle { // This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index cd91c5f6edd56..a6053735d7956 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -9,8 +9,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H +#include "llvm/ADT/StringRef.h" + namespace llvm { namespace AMDGPU { + namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. extern const char* const IdSymbolic[]; @@ -25,6 +28,17 @@ extern const char* const IdSymbolic[]; } // namespace Hwreg +namespace MTBUFFormat { + +extern StringLiteral const DfmtSymbolic[]; +extern StringLiteral const NfmtSymbolicGFX10[]; +extern StringLiteral const NfmtSymbolicSICI[]; +extern StringLiteral const NfmtSymbolicVI[]; +extern StringLiteral const UfmtSymbolic[]; +extern unsigned const DfmtNfmt2UFmt[]; + +} // namespace MTBUFFormat + namespace Swizzle { // Symbolic names for the swizzle(...) syntax. 
extern const char* const IdSymbolic[]; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b89e34e4c99c1..497a04e25e628 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -789,6 +789,52 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) namespace MTBUFFormat { +int64_t getDfmt(const StringRef Name) { + for (int Id = DFMT_MIN; Id <= DFMT_MAX; ++Id) { + if (Name == DfmtSymbolic[Id]) + return Id; + } + return DFMT_UNDEF; +} + +StringRef getDfmtName(unsigned Id) { + assert(Id <= DFMT_MAX); + return DfmtSymbolic[Id]; +} + +static StringLiteral const *getNfmtLookupTable(const MCSubtargetInfo &STI) { + if (isSI(STI) || isCI(STI)) + return NfmtSymbolicSICI; + if (isVI(STI) || isGFX9(STI)) + return NfmtSymbolicVI; + return NfmtSymbolicGFX10; +} + +int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI) { + auto lookupTable = getNfmtLookupTable(STI); + for (int Id = NFMT_MIN; Id <= NFMT_MAX; ++Id) { + if (Name == lookupTable[Id]) + return Id; + } + return NFMT_UNDEF; +} + +StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI) { + assert(Id <= NFMT_MAX); + return getNfmtLookupTable(STI)[Id]; +} + +bool isValidDfmtNfmt(unsigned Id, const MCSubtargetInfo &STI) { + unsigned Dfmt; + unsigned Nfmt; + decodeDfmtNfmt(Id, Dfmt, Nfmt); + return isValidNfmt(Nfmt, STI); +} + +bool isValidNfmt(unsigned Id, const MCSubtargetInfo &STI) { + return !getNfmtName(Id, STI).empty(); +} + int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt) { return (Dfmt << DFMT_SHIFT) | (Nfmt << NFMT_SHIFT); } @@ -798,6 +844,41 @@ void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) { Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK; } +int64_t getUnifiedFormat(const StringRef Name) { + for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { + if (Name == UfmtSymbolic[Id]) + return Id; + } + return UFMT_UNDEF; +} + +StringRef getUnifiedFormatName(unsigned Id) { + return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : ""; +} + +bool isValidUnifiedFormat(unsigned Id) { + return Id <= UFMT_LAST; +} + +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) { + int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt); + for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { + if (Fmt == DfmtNfmt2UFmt[Id]) + return Id; + } + return UFMT_UNDEF; +} + +bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI) { + return isGFX10(STI) ? 
(Val <= UFMT_MAX) : (Val <= DFMT_NFMT_MAX); +} + +unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) { + if (isGFX10(STI)) + return UFMT_DEFAULT; + return DFMT_NFMT_DEFAULT; +} + } // namespace MTBUFFormat //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a9ea05755a676..d250cc2ec03db 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -489,6 +489,30 @@ int64_t encodeDfmtNfmt(unsigned Dfmt, unsigned Nfmt); void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt); +int64_t getDfmt(const StringRef Name); + +StringRef getDfmtName(unsigned Id); + +int64_t getNfmt(const StringRef Name, const MCSubtargetInfo &STI); + +StringRef getNfmtName(unsigned Id, const MCSubtargetInfo &STI); + +bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI); + +bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI); + +int64_t getUnifiedFormat(const StringRef Name); + +StringRef getUnifiedFormatName(unsigned Id); + +bool isValidUnifiedFormat(unsigned Val); + +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt); + +bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI); + +unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI); + } // namespace MTBUFFormat namespace SendMsg { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index d9dcfdca23b35..3451c23891811 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -828,6 +828,24 @@ def : GCNPat < } // End Predicates = [Has16BitInsts] +let SubtargetPredicate = HasIntClamp in { +// Set clamp bit for saturation. +def : VOPBinOpClampPat; +def : VOPBinOpClampPat; +} + +let SubtargetPredicate = HasAddNoCarryInsts, OtherPredicates = [HasIntClamp] in { +let AddedComplexity = 1 in { // Prefer over form with carry-out. +def : VOPBinOpClampPat; +def : VOPBinOpClampPat; +} +} + +let SubtargetPredicate = Has16BitInsts, OtherPredicates = [HasIntClamp] in { +def : VOPBinOpClampPat; +def : VOPBinOpClampPat; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index dcbfeb547a32d..3048bcc610c76 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -667,6 +667,20 @@ def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; +def : VOPBinOpClampPat; +def : VOPBinOpClampPat; + + +// FIXME: Probably should hardcode clamp bit in pseudo and avoid this. 
+class OpSelBinOpClampPat : GCNPat< + (node (i16 (VOP3OpSel i16:$src0, i32:$src0_modifiers)), + (i16 (VOP3OpSel i16:$src1, i32:$src1_modifiers))), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE, 0) +>; + +def : OpSelBinOpClampPat; +def : OpSelBinOpClampPat; } // End SubtargetPredicate = isGFX9Plus def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index fc457ad212d48..446e87ab3fc98 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -77,6 +77,8 @@ def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, lshr_rev>; +let SubtargetPredicate = HasVOP3PInsts in { + // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. // The constant will be emitted as a mov, and folded later. @@ -86,6 +88,19 @@ def : GCNPat< (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) >; +// Integer operations with clamp bit set. +class VOP3PSatPat : GCNPat< + (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), + (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE) +>; + +def : VOP3PSatPat; +def : VOP3PSatPat; +def : VOP3PSatPat; +def : VOP3PSatPat; +} // End SubtargetPredicate = HasVOP3PInsts + multiclass MadFmaMixPats { !if(!isa(Op), getDivergentFrag.ret, Op), Op); } +class getVSrcOp { + RegisterOperand ret = !if(!eq(vt.Size, 32), VSrc_b32, VSrc_b16); +} + +// Class for binary integer operations with the clamp bit set for saturation +// TODO: Add sub with negated inline constant pattern. +class VOPBinOpClampPat : + GCNPat<(node vt:$src0, vt:$src1), + (inst getVSrcOp.ret:$src0, getVSrcOp.ret:$src1, + DSTCLAMP.ENABLE) +>; + + include "VOPCInstructions.td" include "VOP1Instructions.td" include "VOP2Instructions.td" diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4cc2b6bf7e7e0..0353cfd3d86f7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -537,6 +537,18 @@ bool ARMBaseInstrInfo::PredicateInstruction( MachineOperand &PMO = MI.getOperand(PIdx); PMO.setImm(Pred[0].getImm()); MI.getOperand(PIdx+1).setReg(Pred[1].getReg()); + + // Thumb 1 arithmetic instructions do not set CPSR when executed inside an + // IT block. This affects how they are printed. + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { + assert(MCID.OpInfo[1].isOptionalDef() && "CPSR def isn't expected operand"); + assert((MI.getOperand(1).isDead() || + MI.getOperand(1).getReg() != ARM::CPSR) && + "if conversion tried to stop defining used CPSR"); + MI.getOperand(1).setReg(ARM::NoRegister); + } + return true; } return false; @@ -575,6 +587,13 @@ bool ARMBaseInstrInfo::DefinesPredicate( const MachineOperand &MO = MI.getOperand(i); if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) || (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) { + + // Filter out T1 instructions that have a dead CPSR, + // allowing IT blocks to be generated containing T1 instructions + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting && MO.isDead()) + continue; + Pred.push_back(MO); Found = true; } @@ -5501,6 +5520,8 @@ unsigned llvm::ConstantMaterializationCost(unsigned Val, return ForCodesize ? 
4 : 1; if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs return ForCodesize ? 8 : 2; + if (ARM_AM::isSOImmTwoPartValNeg(Val)) // two instrs + return ForCodesize ? 8 : 2; } if (Subtarget->useMovt()) // MOVW + MOVT return ForCodesize ? 8 : 2; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 1a75b011ca59d..f71445cf59c35 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -829,6 +829,10 @@ inline bool isLegalAddressImm(unsigned Opcode, int Imm, return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0; case ARMII::AddrModeT2_i7s4: return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0; + case ARMII::AddrModeT2_i8: + return std::abs(Imm) < (((1 << 8) * 1) - 1); + case ARMII::AddrModeT2_i12: + return Imm >= 0 && Imm < (((1 << 12) * 1) - 1); default: llvm_unreachable("Unhandled Addressing mode"); } diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index ec72c2b5ac194..9be0b6fa33e93 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -873,16 +873,27 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, // FIXME Windows CE supports older ARM CPUs assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+"); - // Expand into a movi + orr. - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg); - assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); unsigned ImmVal = (unsigned)MO.getImm(); - unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); - unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + unsigned SOImmValV1 = 0, SOImmValV2 = 0; + + if (ARM_AM::isSOImmTwoPartVal(ImmVal)) { // Expand into a movi + orr. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + } else { // Expand into a mvn + sub. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(-ImmVal); + SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(-ImmVal); + SOImmValV1 = ~(-SOImmValV1); + } + unsigned MIFlags = MI.getFlags(); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 1428600ca5240..f5bb097062aff 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -528,12 +528,6 @@ class VectorType; const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent = false) const override; - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. 
- return true; - } - bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index e13f3437cc7bd..85da7c5a535e9 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -403,8 +403,9 @@ class InstTemplategetTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32); }]>; +// asr_imm_XFORM - Returns a shift immediate with bit {5} set to 1 +def asr_imm_XFORM : SDNodeXFormgetTargetConstant(0x20 | N->getZExtValue(), SDLoc(N), MVT:: i32); +}]>; + /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. def imm16_31 : ImmLeaf= 16 && (int32_t)Imm < 32; @@ -446,6 +451,8 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ def imm_even : ImmLeaf; def imm_odd : ImmLeaf; +def asr_imm : ImmLeaf 0 && Imm <= 32; }], asr_imm_XFORM>; + //===----------------------------------------------------------------------===// // NEON/MVE pattern fragments // @@ -824,7 +831,9 @@ def mod_imm_neg : Operand, PatLeaf<(imm), [{ def arm_i32imm : IntImmLeafuseMovt()) return true; - return ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue()); + if (ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue())) + return true; + return ARM_AM::isSOImmTwoPartValNeg(Imm.getZExtValue()); }]>; /// imm0_1 predicate - Immediate in the range [0,1]. @@ -4081,6 +4090,15 @@ def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos), (SSAT16 imm1_16:$pos, GPRnopc:$a)>; def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos), (USAT16 imm0_15:$pos, GPRnopc:$a)>; +def : ARMV6Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : ARMV6Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : ARMV6Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : ARMV6Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; + //===----------------------------------------------------------------------===// // Bitwise Instructions. 
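A minimal sketch, not part of the patch, of the arithmetic behind the new mvn + sub path in ExpandMOV32BitImm above. It assumes ARM_AM::getSOImmTwoPartFirst/getSOImmTwoPartSecond split -Imm into shifter-operand-encodable parts V1 and V2 with V1 + V2 == -Imm; expandMvnSub is a hypothetical helper written only to show why the sequence reproduces Imm.

#include <cassert>
#include <cstdint>

// MVN dst, #~(-V1)   =>  dst = ~(~(-V1)) = -V1
// SUB dst, dst, #V2  =>  dst = -V1 - V2 = -(V1 + V2) = Imm
uint32_t expandMvnSub(uint32_t Imm, uint32_t V1, uint32_t V2) {
  assert(V1 + V2 == 0u - Imm && "V1/V2 must be the two-part split of -Imm");
  uint32_t MvnImm = ~(0u - V1); // the immediate actually encoded in the MVN
  uint32_t Dst = ~MvnImm;       // effect of MVN: Dst == -V1 (mod 2^32)
  Dst -= V2;                    // effect of SUB: Dst == Imm
  return Dst;
}

This path is only reached once isSOImmTwoPartValNeg has accepted the value, which presumably guarantees that both #~(-V1) and #V2 are themselves valid shifter-operand immediates.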
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index dc701412b5950..b082ca4d1c903 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3773,7 +3773,14 @@ multiclass MVE_VABD_fp_m : MVE_VABDT_fp_m; defm MVE_VABDf32 : MVE_VABD_fp_m; -defm MVE_VABDf16 : MVE_VABD_fp_m; +defm MVE_VABDf16 : MVE_VABD_fp_m; + +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))), + (MVE_VABDf16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4f32 (fabs (fsub (v4f32 MQPR:$Qm), (v4f32 MQPR:$Qn)))), + (MVE_VABDf32 MQPR:$Qm, MQPR:$Qn)>; +} class MVE_VCVT_fix diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index d5143adaac179..fc92230ac81df 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2636,6 +2636,15 @@ def : T2Pat<(int_arm_ssat16 GPR:$a, imm1_16:$pos), (t2SSAT16 imm1_16:$pos, GPR:$a)>; def : T2Pat<(int_arm_usat16 GPR:$a, imm0_15:$pos), (t2USAT16 imm0_15:$pos, GPR:$a)>; +def : T2Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), + (t2SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), + (t2SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : T2Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; + //===----------------------------------------------------------------------===// // Shift and rotate Instructions. diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index a84d23d3bb96a..09bb3b3c6f728 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1382,9 +1382,27 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, case ARM::t2LDRi8: case ARM::t2LDRi12: return ARM::t2LDR_POST; + case ARM::t2LDRBi8: + case ARM::t2LDRBi12: + return ARM::t2LDRB_POST; + case ARM::t2LDRSBi8: + case ARM::t2LDRSBi12: + return ARM::t2LDRSB_POST; + case ARM::t2LDRHi8: + case ARM::t2LDRHi12: + return ARM::t2LDRH_POST; + case ARM::t2LDRSHi8: + case ARM::t2LDRSHi12: + return ARM::t2LDRSH_POST; case ARM::t2STRi8: case ARM::t2STRi12: return ARM::t2STR_POST; + case ARM::t2STRBi8: + case ARM::t2STRBi12: + return ARM::t2STRB_POST; + case ARM::t2STRHi8: + case ARM::t2STRHi12: + return ARM::t2STRH_POST; case ARM::MVE_VLDRBS16: return ARM::MVE_VLDRBS16_post; @@ -2539,11 +2557,94 @@ static int getBaseOperandIndex(MachineInstr &MI) { case ARM::MVE_VSTRBU8: case ARM::MVE_VSTRHU16: case ARM::MVE_VSTRWU32: + case ARM::t2LDRHi8: + case ARM::t2LDRHi12: + case ARM::t2LDRSHi8: + case ARM::t2LDRSHi12: + case ARM::t2LDRBi8: + case ARM::t2LDRBi12: + case ARM::t2LDRSBi8: + case ARM::t2LDRSBi12: + case ARM::t2STRBi8: + case ARM::t2STRBi12: + case ARM::t2STRHi8: + case ARM::t2STRHi12: return 1; } return -1; } +// Given a memory access Opcode, check that the given Imm would be a valid Offset +// for this instruction (same as isLegalAddressImm), or if the instruction +// could be easily converted to one where that was valid. For example, converting +// t2LDRi12 to t2LDRi8 for negative offsets. Works in conjunction with +// AdjustBaseAndOffset below.
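+// For example: rebasing a t2LDRBi12 with offset 0 over a +4 postinc leaves a
+// residual offset of -4; the i12 form only encodes offsets in [0, 4094], but
+// the i8 form encodes [-254, 254], at the price of one CodesizeEstimate unit
+// because the i8 form can no longer shrink to a 16-bit tLDRBi encoding.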
+static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm, + const TargetInstrInfo *TII, + int &CodesizeEstimate) { + if (isLegalAddressImm(Opcode, Imm, TII)) + return true; + + // We can convert AddrModeT2_i12 to AddrModeT2_i8. + const MCInstrDesc &Desc = TII->get(Opcode); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + case ARMII::AddrModeT2_i12: + CodesizeEstimate += 1; + return std::abs(Imm) < (((1 << 8) * 1) - 1); + } + return false; +} + +// Given an MI adjust its address BaseReg to use NewBaseReg and address offset +// by -Offset. This can either happen in-place or be a replacement as MI is +// converted to another instruction type. +static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, + int Offset, const TargetInstrInfo *TII) { + unsigned BaseOp = getBaseOperandIndex(*MI); + MI->getOperand(BaseOp).setReg(NewBaseReg); + int OldOffset = MI->getOperand(BaseOp + 1).getImm(); + if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII)) + MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset); + else { + unsigned ConvOpcode; + switch (MI->getOpcode()) { + case ARM::t2LDRHi12: + ConvOpcode = ARM::t2LDRHi8; + break; + case ARM::t2LDRSHi12: + ConvOpcode = ARM::t2LDRSHi8; + break; + case ARM::t2LDRBi12: + ConvOpcode = ARM::t2LDRBi8; + break; + case ARM::t2LDRSBi12: + ConvOpcode = ARM::t2LDRSBi8; + break; + case ARM::t2STRHi12: + ConvOpcode = ARM::t2STRHi8; + break; + case ARM::t2STRBi12: + ConvOpcode = ARM::t2STRBi8; + break; + default: + llvm_unreachable("Unhandled convertable opcode"); + } + assert(isLegalAddressImm(ConvOpcode, OldOffset - Offset, TII) && + "Illegal Address Immediate after convert!"); + + const MCInstrDesc &MCID = TII->get(ConvOpcode); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(OldOffset - Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + MI->eraseFromParent(); + } +} + static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, Register NewReg, const TargetInstrInfo *TII, @@ -2562,14 +2663,43 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, TRC = TII->getRegClass(MCID, 2, TRI, *MF); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); - return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) - .addReg(NewReg, RegState::Define) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .addImm(Offset) - .add(MI->getOperand(3)) - .add(MI->getOperand(4)) - .cloneMemRefs(*MI); + unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + case ARMII::AddrModeT2_i7: + case ARMII::AddrModeT2_i7s2: + case ARMII::AddrModeT2_i7s4: + // Any MVE load/store + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + case ARMII::AddrModeT2_i8: + if (MI->mayLoad()) { + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .add(MI->getOperand(0)) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + } else { + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + } 
+ default: + llvm_unreachable("Unhandled createPostIncLoadStore"); + } } // Given a Base Register, optimise the load/store uses to attempt to create more @@ -2589,7 +2719,7 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { // An increment that can be folded in MachineInstr *Increment = nullptr; // Other accesses after BaseAccess that will need to be updated to use the - // postinc value + // postinc value. SmallPtrSet OtherAccesses; for (auto &Use : MRI->use_nodbg_instructions(Base)) { if (!Increment && getAddSubImmediate(Use) != 0) { @@ -2643,14 +2773,20 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { // other offsets after the BaseAccess. We rely on either // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess) // to keep things simple. + // This also adds a simple codesize metric, to detect if an instruction (like + // t2LDRBi12) which can often be shrunk to a thumb1 instruction (tLDRBi) + // cannot because it is converted to something else (t2LDRBi8). We start this + // at -1 for the gain from removing the increment. SmallPtrSet SuccessorAccesses; + int CodesizeEstimate = -1; for (auto *Use : OtherAccesses) { if (DT->dominates(BaseAccess, Use)) { SuccessorAccesses.insert(Use); unsigned BaseOp = getBaseOperandIndex(*Use); - if (!isLegalAddressImm( - Use->getOpcode(), - Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) { + if (!isLegalOrConvertableAddressImm(Use->getOpcode(), + Use->getOperand(BaseOp + 1).getImm() - + IncrementOffset, + TII, CodesizeEstimate)) { LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n"); return false; } @@ -2660,6 +2796,10 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { return false; } } + if (STI->hasMinSize() && CodesizeEstimate > 0) { + LLVM_DEBUG(dbgs() << " Expected to grow instructions under minsize\n"); + return false; + } // Replace BaseAccess with a post inc LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump()); @@ -2674,10 +2814,7 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { for (auto *Use : SuccessorAccesses) { LLVM_DEBUG(dbgs() << "Changing: "; Use->dump()); - unsigned BaseOp = getBaseOperandIndex(*Use); - Use->getOperand(BaseOp).setReg(NewBaseReg); - int OldOffset = Use->getOperand(BaseOp + 1).getImm(); - Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset); + AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII); LLVM_DEBUG(dbgs() << " To : "; Use->dump()); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index b316b1041f2c5..87a106474c5ec 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -409,7 +409,8 @@ void ARMPassConfig::addIRPasses() { // ldrex/strex loops to simplify this, but it needs tidying up. 
   if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
     addPass(createCFGSimplificationPass(
-        SimplifyCFGOptions().sinkCommonInsts(true), [this](const Function &F) {
+        SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true),
+        [this](const Function &F) {
           const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
           return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
         }));
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.h b/llvm/lib/Target/ARM/ARMTargetMachine.h
index ac55d2bdcc2b3..8428092bf1794 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -72,6 +72,12 @@ class ARMBaseTargetMachine : public LLVMTargetMachine {
   }
 
   bool targetSchedulesPostRAScheduling() const override { return true; };
+
+  /// Returns true if a cast between SrcAS and DestAS is a noop.
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+    // Addrspacecasts are always noops.
+    return true;
+  }
 };
 
 /// ARM/Thumb little endian target machine.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 25d0a4c21fa6f..68767398191aa 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -166,7 +166,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
       if (CI->getValue().trunc(16).isAllOnesValue()) {
         auto TrueVector = IC.Builder.CreateVectorSplat(
-            cast<VectorType>(II.getType())->getNumElements(),
+            cast<FixedVectorType>(II.getType())->getNumElements(),
             IC.Builder.getTrue());
         return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
       }
@@ -301,6 +301,7 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
 }
 
 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 TTI::CastContextHint CCH,
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -312,15 +313,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       return Cost == 0 ? 0 : 1;
     return Cost;
   };
+  auto IsLegalFPType = [this](EVT VT) {
+    EVT EltVT = VT.getScalarType();
+    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+           (EltVT == MVT::f64 && ST->hasFP64()) ||
+           (EltVT == MVT::f16 && ST->hasFullFP16());
+  };
 
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
   if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
-
-  // The extend of a load is free
-  if (I && isa<LoadInst>(I->getOperand(0))) {
+    return AdjustCost(
+        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+
+  // Extending masked loads/truncating masked stores is expensive because we
+  // currently don't split them. This means that we'll likely end up
+  // loading/storing each element individually (hence the high cost).
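// Worked instance of the masked-cast cost below, assuming an MVE vector cost
// factor of 1 (the factor is a subtarget property): a masked load of
// <8 x i16> sign-extended to <8 x i32> has a 256-bit destination, wider than
// the 128-bit MVE vectors, so the cost is 2 * 8 * 1 = 16, reflecting the
// likely per-element expansion.
constexpr int maskedExtendCost(int NumElts, int MVECostFactor) {
  return 2 * NumElts * MVECostFactor; // two ops per element, scaled
}
static_assert(maskedExtendCost(8, 1) == 16, "v8i16 -> v8i32 masked extend");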
+ if ((ST->hasMVEIntegerOps() && + (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || + Opcode == Instruction::SExt)) || + (ST->hasMVEFloatOps() && + (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && + IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) + if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) + return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); + + // The extend of other kinds of load is free + if (CCH == TTI::CastContextHint::Normal || + CCH == TTI::CastContextHint::Masked) { static const TypeConversionCostTblEntry LoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, @@ -374,11 +395,9 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - } - // The truncate of a store is free. This is the mirror of extends above. - if (I && I->hasOneUse() && isa(*I->user_begin())) { - static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { + // The truncate of a store is free. This is the mirror of extends above. + static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, @@ -388,19 +407,19 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, }; if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(), - DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { + static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, }; if (SrcTy.isVector() && ST->hasMVEFloatOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(), - DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } } @@ -636,14 +655,8 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, int Lanes = 1; if (SrcTy.isFixedLengthVector()) Lanes = SrcTy.getVectorNumElements(); - auto IsLegal = [this](EVT VT) { - EVT EltVT = VT.getScalarType(); - return (EltVT == MVT::f32 && ST->hasVFP2Base()) || - (EltVT == MVT::f64 && ST->hasFP64()) || - (EltVT == MVT::f16 && ST->hasFullFP16()); - }; - if (IsLegal(SrcTy) && IsLegal(DstTy)) + if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) return Lanes; else return Lanes * CallCost; @@ -672,7 +685,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, ? 
ST->getMVEVectorCostFactor() : 1; return AdjustCost( - BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); + BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 093dfbbf5f026..ac7d0378d90bd 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -210,7 +210,7 @@ class ARMTTIImpl : public BasicTTIImplBase { } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index 24a9fabf09798..dca252a177f25 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -205,6 +205,20 @@ namespace ARM_AM { return V; } + /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained + /// by two SOImmVal, that -V = First + Second. + /// "R+V" can be optimized to (sub (sub R, First), Second). + /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second). + inline bool isSOImmTwoPartValNeg(unsigned V) { + unsigned First; + if (!isSOImmTwoPartVal(-V)) + return false; + // Return false if ~(-First) is not a SoImmval. + First = getSOImmTwoPartFirst(-V); + First = ~(-First); + return !(rotr32(~255U, getSOImmValRotate(First)) & First); + } + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed /// by a left shift. Returns the shift amount to use. 
 inline unsigned getThumbImmValShift(unsigned Imm) {
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 4d7ad6cd60cb9..d037fe7537d20 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -44,7 +44,7 @@
 
 using namespace llvm;
 
-#define DEBUG_TYPE "mve-gather-scatter-lowering"
+#define DEBUG_TYPE "arm-mve-gather-scatter-lowering"
 
 cl::opt<bool> EnableMaskedGatherScatters(
     "enable-arm-maskedgatscat", cl::Hidden, cl::init(false),
@@ -84,7 +84,7 @@ class MVEGatherScatterLowering : public FunctionPass {
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
                   IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
@@ -132,6 +132,11 @@ class MVEGatherScatterLowering : public FunctionPass {
   Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
                                         Value *Ptr, unsigned TypeScale,
                                         IRBuilder<> &Builder);
+
+  // Optimise the base and offsets of the given address
+  bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
+  // Try to fold consecutive geps together into one
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -167,7 +172,49 @@ bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
   return false;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
+  // Offsets that are not of type <v4i32> are sign extended by the
+  // getelementptr instruction, and MVE gathers/scatters treat the offset as
+  // unsigned. Thus, if the element size is smaller than 32, we can only allow
+  // positive offsets - i.e., the offsets are not allowed to be variables we
+  // can't look into.
+  // Additionally, offsets have to either originate from a zext of a
+  // vector with element types smaller than or equal to the type of the gather
+  // we're looking at, or consist of constants that we can check are small
+  // enough to fit into the gather type.
+  // Thus we check that 0 <= value < 2^TargetElemSize.
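// Concrete instance of the bound above: for a 128-bit gather producing
// <8 x i16>, TargetElemCount is 8, so TargetElemSize = 128 / 8 = 16 and each
// constant offset must satisfy 0 <= offset < 2^16. A standalone sketch of the
// range test on plain integers (the pass checks LLVM Constants instead):
#include <cstdint>
static bool offsetFitsGatherElem(int64_t SExtValue, unsigned TargetElemCount) {
  const int64_t TargetElemMaxSize = int64_t(1) << (128 / TargetElemCount);
  return SExtValue >= 0 && SExtValue < TargetElemMaxSize;
}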
+  unsigned TargetElemSize = 128 / TargetElemCount;
+  unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType())
+                                ->getElementType()
+                                ->getScalarSizeInBits();
+  if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) {
+    Constant *ConstOff = dyn_cast<Constant>(Offsets);
+    if (!ConstOff)
+      return false;
+    int64_t TargetElemMaxSize = (1ULL << TargetElemSize);
+    auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) {
+      ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem);
+      if (!OConst)
+        return false;
+      int SExtValue = OConst->getSExtValue();
+      if (SExtValue >= TargetElemMaxSize || SExtValue < 0)
+        return false;
+      return true;
+    };
+    if (isa<FixedVectorType>(ConstOff->getType())) {
+      for (unsigned i = 0; i < TargetElemCount; i++) {
+        if (!CheckValueSize(ConstOff->getAggregateElement(i)))
+          return false;
+      }
+    } else {
+      if (!CheckValueSize(ConstOff))
+        return false;
+    }
+  }
+  return true;
+}
+
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
                                           GetElementPtrInst *GEP,
                                           IRBuilder<> &Builder) {
   if (!GEP) {
@@ -178,40 +225,43 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
                       << " Looking at intrinsic for base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
-  if (GEPPtr->getType()->isVectorTy()) {
+  Offsets = GEP->getOperand(1);
+  if (GEPPtr->getType()->isVectorTy() ||
+      !isa<FixedVectorType>(Offsets->getType()))
     return nullptr;
-  }
+
   if (GEP->getNumOperands() != 2) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
                       << " operands. Expanding.\n");
     return nullptr;
   }
   Offsets = GEP->getOperand(1);
+  unsigned OffsetsElemCount =
+      cast<FixedVectorType>(Offsets->getType())->getNumElements();
   // Paranoid check whether the number of parallel lanes is the same
-  assert(cast<FixedVectorType>(Ty)->getNumElements() ==
-         cast<FixedVectorType>(Offsets->getType())->getNumElements());
-  // Only <N x i32> offsets can be integrated into an arm gather, any smaller
-  // type would have to be sign extended by the gep - and arm gathers can only
-  // zero extend. Additionally, the offsets do have to originate from a zext of
-  // a vector with element types smaller or equal the type of the gather we're
-  // looking at
-  if (Offsets->getType()->getScalarSizeInBits() != 32)
-    return nullptr;
-  if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
+  assert(Ty->getNumElements() == OffsetsElemCount);
+
+  ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets);
+  if (ZextOffs)
     Offsets = ZextOffs->getOperand(0);
-  else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 &&
-             Offsets->getType()->getScalarSizeInBits() == 32))
-    return nullptr;
+  FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType());
+
+  // If the offsets are already being zext-ed to <i32>, that relieves us of
+  // having to make sure that they won't overflow.
+  if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy())
+                           ->getElementType()
+                           ->getScalarSizeInBits() != 32)
+    if (!checkOffsetSize(Offsets, OffsetsElemCount))
+      return nullptr;
+
+  // The offset sizes have been checked; if any truncating or zext-ing is
+  // required to fix them, do that now
   if (Ty != Offsets->getType()) {
-    if ((Ty->getScalarSizeInBits() <
-         Offsets->getType()->getScalarSizeInBits())) {
-      LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
- << " Can't create intrinsic.\n"); - return nullptr; + if ((Ty->getElementType()->getScalarSizeInBits() < + OffsetType->getElementType()->getScalarSizeInBits())) { + Offsets = Builder.CreateTrunc(Offsets, Ty); } else { - Offsets = Builder.CreateZExt( - Offsets, VectorType::getInteger(cast(Ty))); + Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); } } // If none of the checks failed, return the gep's base pointer @@ -426,7 +476,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast(ResultTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -566,7 +617,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast(InputTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -978,6 +1030,127 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, return true; } +static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, + IRBuilder<> &Builder) { + // Splat the non-vector value to a vector of the given type - if the value is + // a constant (and its value isn't too big), we can even use this opportunity + // to scale it to the size of the vector elements + auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { + ConstantInt *Const; + if ((Const = dyn_cast(NonVectorVal)) && + VT->getElementType() != NonVectorVal->getType()) { + unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); + uint64_t N = Const->getZExtValue(); + if (N < (unsigned)(1 << (TargetElemSize - 1))) { + NonVectorVal = Builder.CreateVectorSplat( + VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); + return; + } + } + NonVectorVal = + Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); + }; + + FixedVectorType *XElType = dyn_cast(X->getType()); + FixedVectorType *YElType = dyn_cast(Y->getType()); + // If one of X, Y is not a vector, we have to splat it in order + // to add the two of them. 
+  if (XElType && !YElType) {
+    FixSummands(XElType, Y);
+    YElType = cast<FixedVectorType>(Y->getType());
+  } else if (YElType && !XElType) {
+    FixSummands(YElType, X);
+    XElType = cast<FixedVectorType>(X->getType());
+  }
+  // Check that the summands are of compatible types
+  if (XElType != YElType) {
+    LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n");
+    return nullptr;
+  }
+
+  if (XElType->getElementType()->getScalarSizeInBits() != 32) {
+    // Check that by adding the vectors we do not accidentally
+    // create an overflow
+    Constant *ConstX = dyn_cast<Constant>(X);
+    Constant *ConstY = dyn_cast<Constant>(Y);
+    if (!ConstX || !ConstY)
+      return nullptr;
+    unsigned TargetElemSize = 128 / XElType->getNumElements();
+    for (unsigned i = 0; i < XElType->getNumElements(); i++) {
+      ConstantInt *ConstXEl =
+          dyn_cast<ConstantInt>(ConstX->getAggregateElement(i));
+      ConstantInt *ConstYEl =
+          dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
+      if (!ConstXEl || !ConstYEl ||
+          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+              (unsigned)(1 << (TargetElemSize - 1)))
+        return nullptr;
+    }
+  }
+
+  Value *Add = Builder.CreateAdd(X, Y);
+
+  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
+  if (checkOffsetSize(Add, GEPType->getNumElements()))
+    return Add;
+  else
+    return nullptr;
+}
+
+Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
+                                         Value *&Offsets,
+                                         IRBuilder<> &Builder) {
+  Value *GEPPtr = GEP->getPointerOperand();
+  Offsets = GEP->getOperand(1);
+  // We only merge geps with constant offsets, because only for those
+  // we can make sure that we do not cause an overflow
+  if (!isa<Constant>(Offsets))
+    return nullptr;
+  GetElementPtrInst *BaseGEP;
+  if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+    // Merge the two geps into one
+    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+    if (!BaseBasePtr)
+      return nullptr;
+    Offsets =
+        CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+    if (Offsets == nullptr)
+      return nullptr;
+    return BaseBasePtr;
+  }
+  return GEPPtr;
+}
+
+bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
+                                               LoopInfo *LI) {
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address);
+  if (!GEP)
+    return false;
+  bool Changed = false;
+  if (GEP->hasOneUse() &&
+      dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) {
+    IRBuilder<> Builder(GEP->getContext());
+    Builder.SetInsertPoint(GEP);
+    Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+    Value *Offsets;
+    Value *Base = foldGEP(GEP, Offsets, Builder);
+    // We only want to merge the geps if there is a real chance that they can
+    // be used by an MVE gather; thus the offset has to have the correct size
+    // (always i32 if it is not of vector type) and the base has to be a
+    // pointer.
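// The effect foldGEP aims for, on a hypothetical IR fragment:
//   %a = getelementptr i8, i8* %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
//   %b = getelementptr i8, i8* %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// collapses to a single gep off %base with lanewise-summed offsets, provided
// the sums stay inside the range the checks above accept. A standalone sketch
// of that lanewise constant add with the same overflow guard:
#include <array>
#include <cstddef>
#include <optional>
template <std::size_t N>
std::optional<std::array<unsigned, N>>
addOffsets(const std::array<unsigned, N> &X, const std::array<unsigned, N> &Y) {
  const unsigned Bound = 1u << ((128 / N) - 1); // guard used above
  std::array<unsigned, N> Sum{};
  for (std::size_t I = 0; I < N; ++I) {
    Sum[I] = X[I] + Y[I];
    if (Sum[I] >= Bound)
      return std::nullopt; // would overflow the narrow offset element
  }
  return Sum;
}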
+    if (Offsets && Base && Base != GEP) {
+      PointerType *BaseType = cast<PointerType>(Base->getType());
+      GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
+          BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP);
+      GEP->replaceAllUsesWith(NewAddress);
+      GEP = NewAddress;
+      Changed = true;
+    }
+  }
+  Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI);
+  return Changed;
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -995,22 +1168,17 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-      if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
+      if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+          isa<FixedVectorType>(II->getType())) {
         Gathers.push_back(II);
-        if (isa<GetElementPtrInst>(II->getArgOperand(0)))
-          Changed |= optimiseOffsets(
-              cast<GetElementPtrInst>(II->getArgOperand(0))->getOperand(1),
-              II->getParent(), LI);
-      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+        Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI);
+      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+                 isa<FixedVectorType>(II->getArgOperand(0)->getType())) {
         Scatters.push_back(II);
-        if (isa<GetElementPtrInst>(II->getArgOperand(1)))
-          Changed |= optimiseOffsets(
-              cast<GetElementPtrInst>(II->getArgOperand(1))->getOperand(1),
-              II->getParent(), LI);
+        Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI);
       }
     }
   }
-
   for (unsigned i = 0; i < Gathers.size(); i++) {
     IntrinsicInst *I = Gathers[i];
     Value *L = lowerGather(I);
diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
index 382ddd4572c74..0523ac9447c32 100644
--- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -57,6 +57,7 @@ class MVEVPTOptimisations : public MachineFunctionPass {
                             Register Target);
   bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
   bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+  bool ConvertVPSEL(MachineBasicBlock &MBB);
 };
 
 char MVEVPTOptimisations::ID = 0;
@@ -356,7 +357,7 @@ bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
   }
 
   for (MachineInstr *DeadInstruction : DeadInstructions)
-    DeadInstruction->removeFromParent();
+    DeadInstruction->eraseFromParent();
 
   return Modified;
 }
@@ -430,7 +431,46 @@ bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
   }
 
   for (MachineInstr *DeadInstruction : DeadInstructions)
-    DeadInstruction->removeFromParent();
+    DeadInstruction->eraseFromParent();
+
+  return !DeadInstructions.empty();
+}
+
+// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
+// somewhat blunt approximation that allows tail predication with vpsel
+// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
+// different semantics under tail predication. Until that is modelled we just
+// convert to a VMOVT (via a predicated VORR) instead.
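// Operand mapping performed by the loop below, sketched on hypothetical
// virtual registers (MIR-style pseudocode, not buildable input):
//   %dst = MVE_VPSEL %qn, %qm, %vpr          ; dst = lanewise vpr ? qn : qm
// becomes a Then-predicated VORR whose inactive-lane operand carries %qm:
//   %dst = MVE_VORR %qn, %qn, ARMVCC::Then, %vpr, %qm
// VORR of a register with itself is a move, so predicated lanes take %qn and
// the remaining lanes keep %qm, matching the VPSEL semantics.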
+bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { + bool HasVCTP = false; + SmallVector DeadInstructions; + + for (MachineInstr &MI : MBB.instrs()) { + if (isVCTP(&MI)) { + HasVCTP = true; + continue; + } + + if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) + continue; + + MachineInstrBuilder MIBuilder = + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)) + .addImm(ARMVCC::Then) + .add(MI.getOperand(4)) + .add(MI.getOperand(2)); + // Silence unused variable warning in release builds. + (void)MIBuilder; + LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump(); + dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump()); + DeadInstructions.push_back(&MI); + } + + for (MachineInstr *DeadInstruction : DeadInstructions) + DeadInstruction->eraseFromParent(); return !DeadInstructions.empty(); } @@ -452,6 +492,7 @@ bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock &MBB : Fn) { Modified |= ReplaceVCMPsByVPNOTs(MBB); Modified |= ReduceOldVCCRValueUses(MBB); + Modified |= ConvertVPSEL(MBB); } LLVM_DEBUG(dbgs() << "**************************************\n"); diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index ae661594bdc93..0f7e190386731 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -43,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "t2-reduce-size" +#define DEBUG_TYPE "thumb2-reduce-size" #define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass" STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h index 43b26fd0c5653..46dc914adf784 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h @@ -48,11 +48,6 @@ class AVRAsmBackend : public MCAsmBackend { return AVR::NumTargetFixupKinds; } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp index f82f166eda4d0..ac40b24596a26 100644 --- a/llvm/lib/Target/BPF/BPFMIChecking.cpp +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ -143,12 +143,10 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) { return true; // Otherwise, return true if any aliased SuperReg of GPR32 is not dead. 
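// The BPF change just below swaps a std::find call for llvm::is_contained,
// which has exactly these semantics; a standalone sketch of the helper:
#include <algorithm>
#include <iterator>
template <typename Range, typename T>
bool isContained(const Range &R, const T &Value) {
  return std::find(std::begin(R), std::end(R), Value) != std::end(R);
}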
- std::vector::iterator search_begin = GPR64DeadDefs.begin(); - std::vector::iterator search_end = GPR64DeadDefs.end(); for (auto I : GPR32LiveDefs) for (MCSuperRegIterator SR(I, TRI); SR.isValid(); ++SR) - if (std::find(search_begin, search_end, *SR) == search_end) - return true; + if (!llvm::is_contained(GPR64DeadDefs, *SR)) + return true; return false; } diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 9d829ac45a10b..29e9d5da08364 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -43,11 +43,6 @@ class BPFAsmBackend : public MCAsmBackend { unsigned getNumFixupKinds() const override { return 1; } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } - bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 1e7862c36ea09..3759962c41563 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -641,7 +641,7 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; return finishBundle(IDLoc, Out); } - MCInst *SubInst = new (getParser().getContext()) MCInst; + MCInst *SubInst = getParser().getContext().createMCInst(); if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo, MatchingInlineAsm)) { if (InBrackets) diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index f3a87ef20a608..aeaeac65de960 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -175,7 +175,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, while (Result == Success && !Complete) { if (Bytes.size() < HEXAGON_INSTR_SIZE) return MCDisassembler::Fail; - MCInst *Inst = new (getContext()) MCInst; + MCInst *Inst = getContext().createMCInst(); Result = getSingleInstruction(*Inst, MI, Bytes, Address, cs, Complete); MI.addOperand(MCOperand::createInst(Inst)); Size += HEXAGON_INSTR_SIZE; @@ -384,8 +384,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, break; } MI.setOpcode(Hexagon::DuplexIClass0 + duplexIClass); - MCInst *MILow = new (getContext()) MCInst; - MCInst *MIHigh = new (getContext()) MCInst; + MCInst *MILow = getContext().createMCInst(); + MCInst *MIHigh = getContext().createMCInst(); auto TmpExtender = CurrentExtender; CurrentExtender = nullptr; // constant extenders in duplex must always be in slot 1 diff --git a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 188d91355a35e..9507de95231fe 100644 --- a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -104,7 +104,7 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst *MCI = new (AP.OutContext) MCInst; + MCInst *MCI = AP.OutContext.createMCInst(); MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast(MI->getOpcode()) && "MCI opcode should have been set on construction"); diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td 
index cc10627955fb0..c3422d595c793 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -1082,9 +1082,9 @@ def FShl32r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru), (HiReg (S2_asl_r_p (Combinew $Rs, $Rt), $Ru))>; def FShl64i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S), - (S2_lsr_i_p_or (S2_asl_i_p $Rt, $S), $Rs, (Subi<64> $S))>; + (S2_lsr_i_p_or (S2_asl_i_p $Rs, $S), $Rt, (Subi<64> $S))>; def FShl64r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru), - (S2_lsr_r_p_or (S2_asl_r_p $Rt, $Ru), $Rs, (A2_subri 64, $Ru))>; + (S2_lsr_r_p_or (S2_asl_r_p $Rs, $Ru), $Rt, (A2_subri 64, $Ru))>; // Combined SDNodeXForm: (Divu8 (Subi<64> $S)) def Divu64_8: SDNodeXFormisFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0; unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 5fe397486402e..07e59fb5585e8 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -146,8 +146,9 @@ class HexagonTTIImpl : public BasicTTIImplBase { ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, - const Instruction *I = nullptr); + TTI::CastContextHint CCH, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index e7069819fa575..627c53cadd84f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -74,7 +74,7 @@ class HexagonAsmBackend : public MCAsmBackend { void setExtender(MCContext &Context) const { if (Extender == nullptr) - const_cast(this)->Extender = new (Context) MCInst; + const_cast(this)->Extender = Context.createMCInst(); } MCInst *takeExtender() const { @@ -736,7 +736,7 @@ class HexagonAsmBackend : public MCAsmBackend { auto &Inst = const_cast(RF.getInst()); while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) { - MCInst *Nop = new (Context) MCInst; + MCInst *Nop = Context.createMCInst(); Nop->setOpcode(Hexagon::A2_nop); Inst.addOperand(MCOperand::createInst(Nop)); Size -= 4; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 82b2074c5cd86..e7ade7834a9f4 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -210,7 +210,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, case Hexagon::A2_tfrsi: Rt = L.getOperand(0); compoundOpcode = J4_jumpseti; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rt); @@ -223,7 +223,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, Rs = L.getOperand(1); compoundOpcode = J4_jumpsetr; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rt); 
CompoundInsn->addOperand(Rs); @@ -237,7 +237,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, Rt = L.getOperand(2); compoundOpcode = cmpeqBitOpcode[getCompoundOp(R)]; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(Rt); @@ -250,7 +250,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, Rt = L.getOperand(2); compoundOpcode = cmpgtBitOpcode[getCompoundOp(R)]; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(Rt); @@ -263,7 +263,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, Rt = L.getOperand(2); compoundOpcode = cmpgtuBitOpcode[getCompoundOp(R)]; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(Rt); @@ -281,7 +281,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)]; Rs = L.getOperand(1); - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(L.getOperand(2)); @@ -299,7 +299,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)]; Rs = L.getOperand(1); - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(L.getOperand(2)); @@ -310,7 +310,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, LLVM_DEBUG(dbgs() << "CX: C2_cmpgtui\n"); Rs = L.getOperand(1); compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)]; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(L.getOperand(2)); @@ -321,7 +321,7 @@ static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, LLVM_DEBUG(dbgs() << "CX: S2_tstbit_i\n"); Rs = L.getOperand(1); compoundOpcode = tstBitOpcode[getCompoundOp(R)]; - CompoundInsn = new (Context) MCInst; + CompoundInsn = Context.createMCInst(); CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); CompoundInsn->addOperand(R.getOperand(1)); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index 0fb27a926003f..a17afe5e62f67 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -69,11 +69,6 @@ class LanaiAsmBackend : public MCAsmBackend { return Lanai::NumTargetFixupKinds; } - bool mayNeedRelaxation(const MCInst & /*Inst*/, - const MCSubtargetInfo &STI) const override { - return false; - } - bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp index 958212dc77c9c..071e1484196bc 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp @@ -90,11 +90,6 @@ class MSP430AsmBackend : public MCAsmBackend { return Infos[Kind - FirstTargetFixupKind]; } - 
bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } - bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 1126b871cb114..16c7befb2670a 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -53,15 +53,6 @@ class MipsAsmBackend : public MCAsmBackend { /// @name Target Relaxation Interfaces /// @{ - /// MayNeedRelaxation - Check whether the given instruction may need - /// relaxation. - /// - /// \param Inst - The instruction to test. - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } - /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 155d19ba6959a..18904d0051491 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -182,7 +182,7 @@ namespace { /// memory instruction can be moved to a delay slot. class MemDefsUses : public InspectMemInstr { public: - MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI); + explicit MemDefsUses(const MachineFrameInfo *MFI); private: using ValueType = PointerUnion; @@ -200,7 +200,6 @@ namespace { const MachineFrameInfo *MFI; SmallPtrSet Uses, Defs; - const DataLayout &DL; /// Flags indicating whether loads or stores with no underlying objects have /// been seen. @@ -492,8 +491,8 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { return true; } -MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_) - : InspectMemInstr(false), MFI(MFI_), DL(DL) {} +MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_) + : InspectMemInstr(false), MFI(MFI_) {} bool MemDefsUses::hasHazard_(const MachineInstr &MI) { bool HasHazard = false; @@ -542,7 +541,7 @@ getUnderlyingObjects(const MachineInstr &MI, if (const Value *V = MMO.getValue()) { SmallVector Objs; - GetUnderlyingObjects(V, Objs, DL); + ::getUnderlyingObjects(V, Objs); for (const Value *UValue : Objs) { if (!isIdentifiedObject(V)) @@ -775,7 +774,7 @@ bool MipsDelaySlotFiller::searchBackward(MachineBasicBlock &MBB, auto *Fn = MBB.getParent(); RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo()); - MemDefsUses MemDU(Fn->getDataLayout(), &Fn->getFrameInfo()); + MemDefsUses MemDU(&Fn->getFrameInfo()); ReverseIter Filler; RegDU.init(Slot); @@ -851,7 +850,7 @@ bool MipsDelaySlotFiller::searchSuccBBs(MachineBasicBlock &MBB, IM.reset(new LoadFromStackOrConst()); } else { const MachineFrameInfo &MFI = Fn->getFrameInfo(); - IM.reset(new MemDefsUses(Fn->getDataLayout(), &MFI)); + IM.reset(new MemDefsUses(&MFI)); } if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 16b4d51d3ca63..0c5df4ba1bade 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -365,14 +365,6 @@ class TargetRegisterClass; return ABI.IsN64() ? Mips::A1_64 : Mips::A1; } - /// Returns true if a cast between SrcAS and DestAS is a noop. 
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Mips doesn't have any special address spaces so we just reserve - // the first 256 for software use (e.g. OpenCL) and treat casts - // between them as noops. - return SrcAS < 256 && DestAS < 256; - } - bool isJumpTableRelative() const override { return getTargetMachine().isPositionIndependent(); } diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.h b/llvm/lib/Target/Mips/MipsTargetMachine.h index 25300504a02dc..e0de924be4fd1 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.h +++ b/llvm/lib/Target/Mips/MipsTargetMachine.h @@ -63,6 +63,14 @@ class MipsTargetMachine : public LLVMTargetMachine { return TLOF.get(); } + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Mips doesn't have any special address spaces so we just reserve + // the first 256 for software use (e.g. OpenCL) and treat casts + // between them as noops. + return SrcAS < 256 && DestAS < 256; + } + bool isLittleEndian() const { return isLittle; } const MipsABIInfo &getABI() const { return ABI; } }; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 4296eca6a8dfd..2b2142cc6c212 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -700,12 +700,11 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, bool IsKernelFn = isKernelFunction(F->getFunction()); - // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly + // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly // because the former looks through phi nodes while the latter does not. We // need to look through phi nodes to handle pointer induction variables. 
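// Why the plural form matters in the NVPTX change below, on a hypothetical
// pointer induction variable (IR sketched in a comment):
//   loop:
//     %p = phi i32* [ %arg, %entry ], [ %p.next, %loop ]
//     %v = load i32, i32* %p
// getUnderlyingObject(%p) stops at the phi, while getUnderlyingObjects(%p,
// Objs) walks both incoming values and collects every reachable base -
// which is what canLowerToLDG needs in order to reason about every possible
// base of the access.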
SmallVector Objs; - GetUnderlyingObjects(N->getMemOperand()->getValue(), - Objs, F->getDataLayout()); + getUnderlyingObjects(N->getMemOperand()->getValue(), Objs); return all_of(Objs, [&](const Value *V) { if (auto *A = dyn_cast(V)) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f45cc06e0a0a3..916d515af5dac 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -19,6 +19,7 @@ #include "NVPTXTargetObjectFile.h" #include "NVPTXUtilities.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Analysis.h" @@ -2438,8 +2439,7 @@ static bool isImageOrSamplerVal(const Value *arg, const Module *context) { if (!STy || STy->isLiteral()) return false; - return std::find(std::begin(specialTypes), std::end(specialTypes), - STy->getName()) != std::end(specialTypes); + return llvm::is_contained(specialTypes, STy->getName()); } SDValue NVPTXTargetLowering::LowerFormalArguments( diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index e60b5eeacdaee..d6859337f085f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -214,8 +214,7 @@ bool NVPTXLowerArgs::runOnKernelFunction(Function &F) { for (auto &I : B) { if (LoadInst *LI = dyn_cast(&I)) { if (LI->getType()->isPointerTy()) { - Value *UO = GetUnderlyingObject(LI->getPointerOperand(), - F.getParent()->getDataLayout()); + Value *UO = getUnderlyingObject(LI->getPointerOperand()); if (Argument *Arg = dyn_cast(UO)) { if (Arg->hasByValAttr()) { // LI is a load from a pointer within a byval kernel parameter. diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 13fd7d05ab9f4..81008d3ea5662 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -492,21 +492,6 @@ struct PPCOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()])); } - void addRegQFRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQSRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - - void addRegQBRCOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); - } - void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); @@ -1207,9 +1192,6 @@ bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) { } else if (Name.startswith_lower("v") && !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { RegNo = VRegs[IntVal]; - } else if (Name.startswith_lower("q") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = QFRegs[IntVal]; } else if (Name.startswith_lower("cr") && !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { RegNo = CRRegs[IntVal]; diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 91021d4e584e1..5a06faa16be19 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ 
b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -36,7 +36,6 @@ add_llvm_target(PowerPCCodeGen PPCMacroFusion.cpp PPCMIPeephole.cpp PPCRegisterInfo.cpp - PPCQPXLoadSplat.cpp PPCSubtarget.cpp PPCTargetMachine.cpp PPCTargetObjectFile.cpp diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 74c6fd3733f03..362ddf7204557 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -167,12 +167,6 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass -static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, QFRegs); -} - static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -401,14 +395,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Read the instruction in the proper endianness. uint64_t Inst = ReadFunc(Bytes.data()); - if (STI.getFeatureBits()[PPC::FeatureQPX]) { - DecodeStatus result = - decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); - if (result != MCDisassembler::Fail) - return result; - } else if (STI.getFeatureBits()[PPC::FeatureSPE]) { + if (STI.getFeatureBits()[PPC::FeatureSPE]) { DecodeStatus result = - decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); + decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index dbaf221db9fc9..72401668c8d07 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -46,6 +46,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16ds: return Value & 0xfffc; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: return Value & 0x3ffffffff; } } @@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case PPC::fixup_ppc_br24_notoc: return 4; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: case FK_Data_8: return 8; case PPC::fixup_ppc_nofixup: @@ -100,6 +102,7 @@ class PPCAsmBackend : public MCAsmBackend { { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 0, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = { @@ -112,6 +115,7 @@ class PPCAsmBackend : public MCAsmBackend { { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 2, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; @@ -178,12 +182,6 @@ class PPCAsmBackend : public MCAsmBackend { } } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - // FIXME. 
- return false; - } - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index d8b3301e97f12..1cd190c6b04ea 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -409,6 +409,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; } break; + case PPC::fixup_ppc_imm34: + report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + break; case FK_Data_8: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 2fb8947fd4e0f..73292f7b7938f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -43,6 +43,9 @@ enum Fixups { // A 34-bit fixup corresponding to PC-relative paddi. fixup_ppc_pcrel34, + // A 34-bit fixup corresponding to Non-PC-relative paddi. + fixup_ppc_imm34, + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer /// register number. diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 222bf2fa82836..ce1a43a0c25b2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -49,18 +49,6 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { const char *RegName = getRegisterName(RegNo); - if (RegName[0] == 'q' /* QPX */) { - // The system toolchain on the BG/Q does not understand QPX register names - // in .cfi_* directives, so print the name of the floating-point - // subregister instead. - std::string RN(RegName); - - RN[0] = 'f'; - OS << RN; - - return; - } - OS << RegName; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index fb65e7320f2b0..8c0e0a80b1e2c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -104,20 +104,36 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, return 0; } -uint64_t -PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +uint64_t PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const { const MCOperand &MO = MI.getOperand(OpNo); - if (MO.isReg() || MO.isImm()) + assert(!MO.isReg() && "Not expecting a register for this operand."); + if (MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // Add a fixup for the immediate field. 
- Fixups.push_back(MCFixup::create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_pcrel34)); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Fixup)); return 0; } +uint64_t +PPCMCCodeEmitter::getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_imm34); +} + +uint64_t +PPCMCCodeEmitter::getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_pcrel34); +} + unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index 588aa76bd8064..4504cc6a7405e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -52,7 +52,14 @@ class PPCMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const; uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const; + uint64_t getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 719e005d98135..325ede0fc17ac 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -159,7 +159,6 @@ using llvm::MCPhysReg; static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \ static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \ static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \ - static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \ static const MCPhysReg RRegsNoR0[32] = \ PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \ static const MCPhysReg XRegsNoX0[32] = \ diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 3106290442afa..24a9d419d3ea5 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -44,7 +44,6 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCBranchCoalescingPass(); - FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); @@ -68,7 +67,6 @@ namespace llvm { void initializePPCReduceCRLogicalsPass(PassRegistry&); void initializePPCBSelPass(PassRegistry&); void initializePPCBranchCoalescingPass(PassRegistry&); - void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 9ad78bf67fe6c..cda809e2472d4 100644 --- 
a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -132,9 +132,6 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", "Enable PPC 4xx instructions">; def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", "Enable PPC 6xx instructions">; -def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", - "Enable QPX instructions", - [FeatureFPU]>; def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; @@ -193,7 +190,7 @@ def FeatureFloat128 : def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "POPCNTD_Fast", "Enable the popcnt[dw] instructions">; -// Note that for the a2/a2q processor models we should not use popcnt[dw] by +// Note that for the a2 processor models we should not use popcnt[dw] by // default. These processors do support the instructions, but they're // microcoded, and the software emulation is about twice as fast. def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD", @@ -237,6 +234,10 @@ def FeaturePCRelativeMemops : SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true", "Enable PC relative Memory Ops", [FeatureISA3_0]>; +def FeaturePairedVectorMemops: + SubtargetFeature<"paired-vector-memops", "PairedVectorMemops", "true", + "32Byte load and store instructions", + [FeatureISA3_0]>; def FeaturePredictableSelectIsExpensive : SubtargetFeature<"predictable-select-expensive", @@ -342,7 +343,7 @@ def ProcessorFeatures { // still exist with the exception of those we know are Power9 specific. list P10AdditionalFeatures = [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, - FeaturePCRelativeMemops, FeatureP10Vector]; + FeaturePCRelativeMemops, FeatureP10Vector, FeaturePairedVectorMemops]; list P10SpecificFeatures = []; list P10InheritableFeatures = !listconcat(P9InheritableFeatures, P10AdditionalFeatures); @@ -514,15 +515,6 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"a2q", PPCA2Model, - [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, - FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, - FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, - FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureQPX, - FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index bc869c39e3934..10962df7b872c 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -549,9 +549,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || - PPC::QBRCRegClass.contains(Reg) || - PPC::QFRCRegClass.contains(Reg) || - PPC::QSRCRegClass.contains(Reg) || PPC::VFRCRegClass.contains(Reg) || PPC::VRRCRegClass.contains(Reg) || PPC::VSFRCRegClass.contains(Reg) || @@ -1678,22 +1675,31 @@ void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) { report_fatal_error("COMDAT not yet supported by AIX."); } -static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { - return StringSwitch(GV->getName()) - .Cases("llvm.global_ctors", "llvm.global_dtors", true) - 
.Default(false); +static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) { + return GV->hasAppendingLinkage() && + StringSwitch<bool>(GV->getName()) + // TODO: Update the handling of global arrays for static init when + // we support the ".ref" directive. + // Otherwise, we can skip these arrays, because the AIX linker + // collects static init functions simply based on their name. + .Cases("llvm.global_ctors", "llvm.global_dtors", true) + // TODO: The linker could still eliminate the GV if we just skip + // handling the llvm.used array. Skipping it for now until we or the + // AIX OS team come up with a good solution. + .Case("llvm.used", true) + // It's correct to just skip the llvm.compiler.used array here. + .Case("llvm.compiler.used", true) + .Default(false); } void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { - ValidateGV(GV); - - // TODO: Update the handling of global arrays for static init when we support - // the ".ref" directive. - // Otherwise, we can skip these arrays, because the AIX linker collects - // static init functions simply based on their name. - if (isSpecialLLVMGlobalArrayForStaticInit(GV)) + if (isSpecialLLVMGlobalArrayToSkip(GV)) return; + assert(!GV->getName().startswith("llvm.") && + "Unhandled intrinsic global variable."); + ValidateGV(GV); + // Create the symbol, set its storage class. MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV)); GVSym->setStorageClass( @@ -1773,7 +1779,11 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { } void PPCAIXAsmPrinter::emitFunctionEntryLabel() { - PPCAsmPrinter::emitFunctionEntryLabel(); + // It's not necessary to emit the entry-point label when each function + // is placed in its own csect. + if (!TM.getFunctionSections()) + PPCAsmPrinter::emitFunctionEntryLabel(); + // Emit aliasing label for function entry point label. llvm::for_each( GOAliasMap[&MF->getFunction()], [this](const GlobalAlias *Alias) { @@ -1836,8 +1846,11 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { // We need to know, up front, the alignment of csects for the assembly path, // because once a .csect directive gets emitted, we cannot change the // alignment value on it. - for (const auto &G : M.globals()) + for (const auto &G : M.globals()) { + if (isSpecialLLVMGlobalArrayToSkip(&G)) + continue; setCsectAlignment(&G); + } for (const auto &F : M) setCsectAlignment(&F); diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 1eaa7f7a44b39..9a15490f1fb0d 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -61,9 +61,6 @@ def RetCC_PPC_Cold : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F1]>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>> @@ -98,10 +95,6 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - // QPX vectors are returned in QF1 and QF2. - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, - // Vector types returned as "direct" go into V2 .. V9; note that only the // ELFv2 ABI fully utilizes all these registers.
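
// Editor's sketch: the AIX changes above hinge on a single predicate that is
// consulted both before csect alignment is computed (doInitialization) and
// before emission (emitGlobalVariable), so the two passes cannot disagree.
// Condensed restatement, using only names from the hunks above:
static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) {
  // Appending-linkage arrays that the AIX linker (ctors/dtors, by name) or
  // the compiler itself (llvm.used, llvm.compiler.used) already accounts for.
  return GV->hasAppendingLinkage() &&
         StringSwitch<bool>(GV->getName())
             .Cases("llvm.global_ctors", "llvm.global_dtors", true)
             .Cases("llvm.used", "llvm.compiler.used", true)
             .Default(false);
}
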
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], @@ -158,8 +151,6 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>> @@ -223,9 +214,6 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>, - // QPX vectors that are stored in double precision need 32-byte alignment. - CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>, - // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> @@ -243,10 +231,6 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[ // put vector arguments in vector registers before putting them on the stack. let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ - // QPX vectors mirror the scalar FP convention. - CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", - CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>, - // The first 12 Vector arguments are passed in AltiVec registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 8ffd89ef5ccd2..3e218e14d8d44 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4142,7 +4142,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { // Altivec Vector compare instructions do not set any CR register by default and // vector compare operations return the same type as the operands. 
if (LHS.getValueType().isVector()) { - if (Subtarget->hasQPX() || Subtarget->hasSPE()) + if (Subtarget->hasSPE()) return false; EVT VecVT = LHS.getValueType(); @@ -4813,8 +4813,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); switch (LoadedVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid PPC load type!"); - case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX - case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX case MVT::f64: Opcode = PPC::LFDUX; break; case MVT::f32: Opcode = PPC::LFSUX; break; case MVT::i32: Opcode = PPC::LWZUX; break; @@ -5095,12 +5093,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectCCOp = PPC::SELECT_CC_F16; else if (Subtarget->hasSPE()) SelectCCOp = PPC::SELECT_CC_SPE; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64) - SelectCCOp = PPC::SELECT_CC_QFRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32) - SelectCCOp = PPC::SELECT_CC_QSRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1) - SelectCCOp = PPC::SELECT_CC_QBRC; else if (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64) SelectCCOp = PPC::SELECT_CC_VSRC; @@ -5856,9 +5848,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: @@ -6177,9 +6166,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index edc23b2673f38..ac7f4f9c34f92 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -74,6 +74,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" @@ -338,6 +339,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FMA , MVT::f32, Legal); } + if (Subtarget.hasSPE()) + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root @@ -811,6 +815,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.isISA3_1()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v2i64, Legal); + setOperationAction(ISD::MULHU, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v4i32, Legal); + setOperationAction(ISD::MULHU, MVT::v4i32, Legal); setOperationAction(ISD::UDIV, MVT::v2i64, Legal); setOperationAction(ISD::SDIV, MVT::v2i64, Legal); setOperationAction(ISD::UDIV, MVT::v4i32, Legal); @@ -932,6 +940,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SUB, MVT::v2i64, Expand); } + setOperationAction(ISD::SETCC, MVT::v1i128, Expand); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::STORE, MVT::v2i64, Promote); @@ -1091,161 +1101,6 @@ 
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } - if (Subtarget.hasQPX()) { - setOperationAction(ISD::FADD, MVT::v4f64, Legal); - setOperationAction(ISD::FSUB, MVT::v4f64, Legal); - setOperationAction(ISD::FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::FREM, MVT::v4f64, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f64, Custom); - setOperationAction(ISD::STORE , MVT::v4f64, Custom); - - setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f64, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); - - setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); - - setOperationAction(ISD::FNEG , MVT::v4f64, Legal); - setOperationAction(ISD::FABS , MVT::v4f64, Legal); - setOperationAction(ISD::FSIN , MVT::v4f64, Expand); - setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOW , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); - - addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); - - setOperationAction(ISD::FADD, MVT::v4f32, Legal); - setOperationAction(ISD::FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f32, Custom); - setOperationAction(ISD::STORE , MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); - - setOperationAction(ISD::FNEG , MVT::v4f32, Legal); - 
setOperationAction(ISD::FABS , MVT::v4f32, Legal); - setOperationAction(ISD::FSIN , MVT::v4f32, Expand); - setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOW , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); - - addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); - - setOperationAction(ISD::AND , MVT::v4i1, Legal); - setOperationAction(ISD::OR , MVT::v4i1, Legal); - setOperationAction(ISD::XOR , MVT::v4i1, Legal); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); - - setOperationAction(ISD::LOAD , MVT::v4i1, Custom); - setOperationAction(ISD::STORE , MVT::v4i1, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); - - addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); - - setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); - setOperationAction(ISD::FROUND, MVT::v4f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); - - // These need to set FE_INEXACT, and so cannot be vectorized here. - setOperationAction(ISD::FRINT, MVT::v4f64, Expand); - setOperationAction(ISD::FRINT, MVT::v4f32, Expand); - - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); - - setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); - } else { - setOperationAction(ISD::FDIV, MVT::v4f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); - - setOperationAction(ISD::FDIV, MVT::v4f32, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); - } - - // TODO: Handle constrained floating-point operations of v4f64 - } - if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); @@ -1434,8 +1289,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, // 16byte and wider vectors are passed on 16byte boundary. // The rest is 8 on PPC64 and 4 on PPC32 boundary. Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); - if (Subtarget.hasAltivec() || Subtarget.hasQPX()) - getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? 
Align(32) : Align(16)); + if (Subtarget.hasAltivec()) + getMaxByValAlign(Ty, Alignment, Align(16)); return Alignment.value(); } @@ -1573,12 +1428,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::VABSD: return "PPCISD::VABSD"; - case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; - case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; - case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; - case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; - case PPCISD::QBFLT: return "PPCISD::QBFLT"; - case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1597,9 +1446,6 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - if (Subtarget.hasQPX()) - return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); - return VT.changeVectorElementTypeToInteger(); } @@ -2773,16 +2619,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } - // PowerPC doesn't have preinc load/store instructions for vectors (except - // for QPX, which does have preinc r+r forms). - if (VT.isVector()) { - if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { - return false; - } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { - AM = ISD::PRE_INC; - return true; - } - } + // PowerPC doesn't have preinc load/store instructions for vectors + if (VT.isVector()) + return false; if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer @@ -3504,11 +3343,6 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; -/// QFPR - The set of QPX registers that should be allocated for arguments. -static const MCPhysReg QFPR[] = { - PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, - PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; - /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, @@ -3538,10 +3372,6 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Alignment = Align(16); - // QPX vector types stored in double-precision are padded to a 32 byte - // boundary. - else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) - Alignment = Align(32); // ByVal parameters are aligned as requested. if (Flags.isByVal()) { @@ -3573,14 +3403,11 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. 
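
// With the 32-byte QPX types gone, no argument class needs more than 16-byte
// alignment. A hedged distillation of the CalculateStackSlotAlignment hunk
// above (the helper name is illustrative, and the ByVal and over-aligned
// cases are omitted):
static Align computeSlotAlign(EVT ArgVT, unsigned PtrByteSize) {
  Align Alignment(PtrByteSize); // Scalars get pointer-sized slots.
  if (ArgVT == MVT::v16i8 || ArgVT == MVT::v8i16 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v4f32 || ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
      ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
    Alignment = Align(16); // 16-byte and wider vectors, and f128.
  return Alignment;
}
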
-static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, - ISD::ArgFlagsTy Flags, - unsigned PtrByteSize, - unsigned LinkageSize, - unsigned ParamAreaSize, - unsigned &ArgOffset, +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, unsigned LinkageSize, + unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, - unsigned &AvailableVRs, bool HasQPX) { + unsigned &AvailableVRs) { bool UseMemory = false; // Respect alignment of argument on the stack. @@ -3604,11 +3431,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { - if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || - // QPX registers overlap with the scalar FP registers. - (HasQPX && (ArgVT == MVT::v4f32 || - ArgVT == MVT::v4f64 || - ArgVT == MVT::v4i1))) + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) if (AvailableFPRs > 0) { --AvailableFPRs; return false; @@ -3747,18 +3570,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::VRRCRegClass; break; case MVT::v4f32: - RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; + RC = &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; - case MVT::v4f64: - RC = &PPC::QFRCRegClass; - break; - case MVT::v4i1: - RC = &PPC::QBRCRegClass; - break; } SDValue ArgValue; @@ -3957,7 +3774,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); - const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area @@ -3976,8 +3792,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + NumBytes, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } @@ -3987,7 +3802,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; @@ -4230,51 +4044,20 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { - // These can be scalar arguments or elements of a vector array type - // passed directly. The latter are used to implement ELFv2 homogenous - // vector aggregates. - if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++VR_idx; - } else { - if (CallConv == CallingConv::Fast) - ComputeArgOffset(); - needsLoad = true; - } - if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += 16; - break; - } // not QPX - - assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - LLVM_FALLTHROUGH; - - case MVT::v4f64: - case MVT::v4i1: - // QPX vectors are treated like their scalar floating-point subregisters - // (except that they're larger). - unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; - if (QFPR_idx != Num_QFPR_Regs) { - const TargetRegisterClass *RC; - switch (ObjectVT.getSimpleVT().SimpleTy) { - case MVT::v4f64: RC = &PPC::QFRCRegClass; break; - case MVT::v4f32: RC = &PPC::QSRCRegClass; break; - default: RC = &PPC::QBRCRegClass; break; - } - - unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++QFPR_idx; + ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += Sz; + ArgOffset += 16; break; } @@ -4827,10 +4610,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; - if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, + LinkageSize, ParamAreaSize, NumBytes, + AvailableFPRs, AvailableVRs)) return true; } return false; @@ -5344,50 +5126,38 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, Subtarget.is32BitELFABI() && !isLocalCallee() && Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_; - // On AIX, direct function calls reference the symbol for the function's - // entry point, which is named by prepending a "." before the function's - // C-linkage name. - const auto getFunctionEntryPointSymbol = [&](StringRef SymName) { - auto &Context = DAG.getMachineFunction().getMMI().getContext(); - return cast( - Context.getOrCreateSymbol(Twine(".") + Twine(SymName))); + const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) { + const TargetMachine &TM = Subtarget.getTargetMachine(); + const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering(); + MCSymbolXCOFF *S = + cast(TLOF->getFunctionEntryPointSymbol(GV, TM)); + + if (GV->isDeclaration() && !S->hasRepresentedCsectSet()) { + // On AIX, an undefined symbol needs to be associated with a + // MCSectionXCOFF to get the correct storage mapping class. + // In this case, XCOFF::XMC_PR. + const XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV); + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + MCSectionXCOFF *Sec = Context.getXCOFFSection( + S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC, + SectionKind::getMetadata()); + S->setRepresentedCsect(Sec); + } + + MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + return DAG.getMCSymbol(S, PtrVT); }; - const auto getAIXFuncEntryPointSymbolSDNode = - [&](StringRef FuncName, bool IsDeclaration, - const XCOFF::StorageClass &SC) { - MCSymbolXCOFF *S = getFunctionEntryPointSymbol(FuncName); - - auto &Context = DAG.getMachineFunction().getMMI().getContext(); - - if (IsDeclaration && !S->hasRepresentedCsectSet()) { - // On AIX, an undefined symbol needs to be associated with a - // MCSectionXCOFF to get the correct storage mapping class. - // In this case, XCOFF::XMC_PR. 
- MCSectionXCOFF *Sec = Context.getXCOFFSection( - S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC, - SectionKind::getMetadata()); - S->setRepresentedCsect(Sec); - } - - MVT PtrVT = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - return DAG.getMCSymbol(S, PtrVT); - }; - if (isFunctionGlobalAddress(Callee)) { - const GlobalAddressSDNode *G = cast(Callee); - const GlobalValue *GV = G->getGlobal(); + const GlobalValue *GV = cast(Callee)->getGlobal(); - if (!Subtarget.isAIXABI()) - return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, - UsePlt ? PPCII::MO_PLT : 0); - - assert(!isa(GV) && "IFunc is not supported on AIX."); - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV); - return getAIXFuncEntryPointSymbolSDNode(GV->getName(), GV->isDeclaration(), - SC); + if (Subtarget.isAIXABI()) { + assert(!isa(GV) && "IFunc is not supported on AIX."); + return getAIXFuncEntryPointSymbolSDNode(GV); + } + return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, + UsePlt ? PPCII::MO_PLT : 0); } if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { @@ -5397,12 +5167,18 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, // ExternalSymbol's, then we pick up the user-declared version. const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); if (const Function *F = - dyn_cast_or_null(Mod->getNamedValue(SymName))) { - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F); - return getAIXFuncEntryPointSymbolSDNode(F->getName(), - F->isDeclaration(), SC); - } + dyn_cast_or_null(Mod->getNamedValue(SymName))) + return getAIXFuncEntryPointSymbolSDNode(F); + + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by prepending a "." before the function's + // C-linkage name. + const auto getFunctionEntryPointSymbol = [&](StringRef SymName) { + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + return cast( + Context.getOrCreateSymbol(Twine(".") + Twine(SymName))); + }; + SymName = getFunctionEntryPointSymbol(SymName)->getName().data(); } return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(), @@ -6060,7 +5836,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -6074,7 +5849,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); - const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. 
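
// Editor's note: after this refactoring both callee forms, a GlobalAddress
// and an ExternalSymbol that resolves to a declared Function, funnel through
// the one lambda above, keyed on the GlobalValue. Condensed call sites
// (names as in the hunk; the surrounding transformCallee context is assumed):
if (isFunctionGlobalAddress(Callee)) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
  if (Subtarget.isAIXABI())
    return getAIXFuncEntryPointSymbolSDNode(GV);
}
if (const auto *S = dyn_cast<ExternalSymbolSDNode>(Callee))
  if (const Function *F =
          dyn_cast_or_null<Function>(Mod->getNamedValue(S->getSymbol())))
    return getAIXFuncEntryPointSymbolSDNode(F);
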
@@ -6089,9 +5863,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytesTmp, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } } @@ -6139,20 +5912,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. - if (Subtarget.hasQPX()) { - if (++NumFPRsUsed <= NumFPRs) - continue; - } else { - if (++NumVRsUsed <= NumVRs) - continue; - } + if (++NumVRsUsed <= NumVRs) + continue; break; case MVT::f32: case MVT::f64: - case MVT::v4f64: // QPX - case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; @@ -6514,7 +6278,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. @@ -6570,63 +6333,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (!IsFastCall) ArgOffset += 16; break; - } // not QPX - - assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - - LLVM_FALLTHROUGH; - case MVT::v4f64: - case MVT::v4i1: { - bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; - if (CFlags.IsVarArg) { - assert(HasParameterArea && - "Parameter area must exist if we have a varargs call."); - // We could elide this store in the case where the object fits - // entirely in R registers. Maybe later. - SDValue Store = - DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Store); - if (QFPR_idx != NumQFPRs) { - SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, - PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); - } - ArgOffset += (IsF32 ? 16 : 32); - for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { - if (GPR_idx == NumGPRs) - break; - SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, - DAG.getConstant(i, dl, PtrVT)); - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - break; - } - - // Non-varargs QPX params go into registers or on the stack. - if (QFPR_idx != NumQFPRs) { - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); - } else { - if (IsFastCall) - ComputePtrOff(); - - assert(HasParameterArea && - "Parameter area must exist to pass an argument in memory."); - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - true, CFlags.IsTailCall, true, MemOpChains, - TailCallArguments, dl); - if (IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - } - - if (!IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - break; - } } } @@ -7297,8 +7003,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( const PPCSubtarget &Subtarget = static_cast(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX support is not supported on AIX."); const bool IsPPC64 = Subtarget.isPPC64(); const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; @@ -7518,8 +7222,6 @@ SDValue PPCTargetLowering::LowerCall_AIX( const PPCSubtarget& Subtarget = static_cast(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX is not supported on AIX."); if (Subtarget.hasAltivec()) report_fatal_error("Altivec support is unimplemented on AIX."); @@ -7987,8 +7689,6 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, } SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) - return LowerVectorLoad(Op, DAG); assert(Op.getValueType() == MVT::i1 && "Custom lowering only for i1 loads"); @@ -8012,9 +7712,6 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(1).getValueType().isVector()) - return LowerVectorStore(Op, DAG); - assert(Op.getOperand(1).getValueType() == MVT::i1 && "Custom lowering only for i1 stores"); @@ -8100,9 +7797,9 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - // Not FP? Not a fsel. + // Not FP, or using SPE? Not a fsel. if (!Op.getOperand(0).getValueType().isFloatingPoint() || - !Op.getOperand(2).getValueType().isFloatingPoint()) + !Op.getOperand(2).getValueType().isFloatingPoint() || Subtarget.hasSPE()) return Op; ISD::CondCode CC = cast(Op.getOperand(4))->get(); @@ -8218,36 +7915,41 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { return Op; } -void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, - SelectionDAG &DAG, - const SDLoc &dl) const { - assert(Op.getOperand(0).getValueType().isFloatingPoint()); +static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + SDLoc dl(Op); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; SDValue Src = Op.getOperand(0); + assert(Src.getValueType().isFloatingPoint()); if (Src.getValueType() == MVT::f32) Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); - - SDValue Tmp; + SDValue Conv; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: - Tmp = DAG.getNode( - Op.getOpcode() == ISD::FP_TO_SINT - ? PPCISD::FCTIWZ - : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), + Conv = DAG.getNode( + IsSigned ? PPCISD::FCTIWZ + : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), dl, MVT::f64, Src); break; case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && + assert((IsSigned || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); - Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : - PPCISD::FCTIDUZ, - dl, MVT::f64, Src); - break; + Conv = DAG.getNode(IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ, dl, + MVT::f64, Src); } + return Conv; +} + +void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + const SDLoc &dl) const { + SDValue Tmp = convertFPToInt(Op, DAG, Subtarget); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; // Convert the FP value to an int value through memory. 
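
// Editor's note: convertFPToInt (added above) centralizes the choice among
// FCTIWZ/FCTIWUZ/FCTIDZ/FCTIDUZ, so its two consumers differ only in how the
// converted f64 leaves the FPR. Hedged sketch of the contrast:
//
//   // Pre-direct-move path (this function): bounce through a stack slot.
//   SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
//   // ...store Tmp, then reload it as an integer...
//
//   // ISA 2.07+ direct-move path (LowerFP_TO_INTDirectMove): one MFVSR.
//   return DAG.getNode(PPCISD::MFVSR, dl, Op.getSimpleValueType(),
//                      convertFPToInt(Op, DAG, Subtarget));
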
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && - (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); + (IsSigned || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast(FIPtr)->getIndex(); MachinePointerInfo MPI = @@ -8288,51 +7990,25 @@ SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { assert(Op.getOperand(0).getValueType().isFloatingPoint()); - SDValue Src = Op.getOperand(0); - - if (Src.getValueType() == MVT::f32) - Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); - - SDValue Tmp; - switch (Op.getSimpleValueType().SimpleTy) { - default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); - case MVT::i32: - Tmp = DAG.getNode( - Op.getOpcode() == ISD::FP_TO_SINT - ? PPCISD::FCTIWZ - : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), - dl, MVT::f64, Src); - Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); - break; - case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && - "i64 FP_TO_UINT is supported only with FPCVT"); - Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : - PPCISD::FCTIDUZ, - dl, MVT::f64, Src); - Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); - break; - } - return Tmp; + return DAG.getNode(PPCISD::MFVSR, dl, Op.getSimpleValueType().SimpleTy, + convertFPToInt(Op, DAG, Subtarget)); } SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { - + SDValue Src = Op.getOperand(0); // FP to INT conversions are legal for f128. - if (Op->getOperand(0).getValueType() == MVT::f128) + if (Src.getValueType() == MVT::f128) return Op; // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). - if (Op.getOperand(0).getValueType() == MVT::ppcf128) { + if (Src.getValueType() == MVT::ppcf128) { if (Op.getValueType() == MVT::i32) { if (Op.getOpcode() == ISD::FP_TO_SINT) { - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - MVT::f64, Op.getOperand(0), + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, DAG.getIntPtrConstant(0, dl)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - MVT::f64, Op.getOperand(0), + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, DAG.getIntPtrConstant(1, dl)); // Add the two halves of the long double in round-to-zero mode. @@ -8348,15 +8024,12 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. // TODO: Are there fast-math-flags to propagate to this FSUB? 
- SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, - Op.getOperand(0), Tmp); + SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Tmp); True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, DAG.getConstant(0x80000000, dl, MVT::i32)); - SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, - Op.getOperand(0)); - return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, - ISD::SETGE); + SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + return DAG.getSelectCC(dl, Src, Tmp, True, False, ISD::SETGE); } } @@ -8475,6 +8148,19 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { return false; } +static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP; + SDLoc dl(Op); + // If we have FCFIDS, then use it when converting to single-precision. + // Otherwise, convert to double-precision and then round. + bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT(); + unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS) + : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU); + EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64; + return DAG.getNode(ConvOpc, dl, ConvTy, Src); +} + /// Custom lowers integer to floating point conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. @@ -8486,25 +8172,12 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, "Invalid floating point type as target of conversion"); assert(Subtarget.hasFPCVT() && "Int to FP conversions with direct moves require FPCVT"); - SDValue FP; SDValue Src = Op.getOperand(0); - bool SinglePrec = Op.getValueType() == MVT::f32; bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; - unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : - (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); - - if (WordInt) { - FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, - dl, MVT::f64, Src); - FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); - } - else { - FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); - FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); - } - - return FP; + unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA; + SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src); + return convertIntToFP(Op, Mov, DAG, Subtarget); } static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { @@ -8580,8 +8253,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP; - EVT InVT = Op.getOperand(0).getValueType(); + EVT InVT = Src.getValueType(); EVT OutVT = Op.getValueType(); if (OutVT.isVector() && OutVT.isFloatingPoint() && isOperationCustom(Op.getOpcode(), InVT)) @@ -8591,33 +8266,12 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType() == MVT::f128) return Op; - if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { - if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) - return SDValue(); - - SDValue Value = Op.getOperand(0); - // The values are now known to be -1 (false) or 1 (true). 
To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - if (Op.getValueType() != MVT::v4f64) - Value = DAG.getNode(ISD::FP_ROUND, dl, - Op.getValueType(), Value, - DAG.getIntPtrConstant(1, dl)); - return Value; - } - // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); - if (Op.getOperand(0).getValueType() == MVT::i1) - return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), + if (Src.getValueType() == MVT::i1) + return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Src, DAG.getConstantFP(1.0, dl, Op.getValueType()), DAG.getConstantFP(0.0, dl, Op.getValueType())); @@ -8627,22 +8281,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Subtarget.isPPC64() && Subtarget.hasFPCVT()) return LowerINT_TO_FPDirectMove(Op, DAG, dl); - assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && + assert((IsSigned || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); - // If we have FCFIDS, then use it when converting to single-precision. - // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) - ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS - : PPCISD::FCFIDS) - : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU - : PPCISD::FCFID); - MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) - ? MVT::f32 - : MVT::f64; - - if (Op.getOperand(0).getValueType() == MVT::i64) { - SDValue SINT = Op.getOperand(0); + if (Src.getValueType() == MVT::i64) { + SDValue SINT = Src; // When converting to single-precision, we actually need to convert // to double-precision first and then round to single-precision. // To avoid double-rounding effects during that operation, we have @@ -8755,7 +8398,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, } else Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); - SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); + SDValue FP = convertIntToFP(Op, Bits, DAG, Subtarget); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, @@ -8763,7 +8406,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, return FP; } - assert(Op.getOperand(0).getValueType() == MVT::i32 && + assert(Src.getValueType() == MVT::i32 && "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. 
In particular, sign extend the input value into the @@ -8777,15 +8420,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { ReuseLoadInfo RLI; bool ReusingLoad; - if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, - DAG))) { + if (!(ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG))) { int FrameIdx = MFI.CreateStackObject(4, Align(4), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FrameIdx)); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Src, FIdx, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FrameIdx)); assert(cast(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); @@ -8801,10 +8442,9 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; - Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? - PPCISD::LFIWZX : PPCISD::LFIWAX, - dl, DAG.getVTList(MVT::f64, MVT::Other), - Ops, MVT::i32, MMO); + Ld = DAG.getMemIntrinsicNode(IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl, + DAG.getVTList(MVT::f64, MVT::Other), Ops, + MVT::i32, MMO); if (ReusingLoad) spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { @@ -8814,8 +8454,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = MFI.CreateStackObject(8, Align(8), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, - Op.getOperand(0)); + SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src); // STD the extended value into the stack slot. SDValue Store = DAG.getStore( @@ -8829,7 +8468,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, } // FCFID it and return it. - SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); + SDValue FP = convertIntToFP(Op, Ld, DAG, Subtarget); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); @@ -9126,13 +8765,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || - InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { + IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED; InputLoad = &InputLoad->getOperand(0); + } if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; LoadSDNode *LD = cast(*InputLoad); @@ -9178,110 +8819,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); - if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { - // We first build an i32 vector, load it into a QPX register, - // then convert it to a floating-point vector and compare it - // to a zero vector to get the boolean result. 
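
// Editor's note: getNormalLoadInput now also reports, through IsPermuted,
// whether the scalar arrived via SCALAR_TO_VECTOR_PERMUTED. The shuffle
// lowering further below uses that to rebase the splat index, because the
// loaded value occupies the left half of a vector that is 8 bytes wider than
// the load itself. Sketch of the little-endian fix-up:
//
//   bool IsPermutedLoad = false;
//   const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
//   int SplatIdx =
//       PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
//   if (IsPermutedLoad)
//     SplatIdx += IsFourByte ? 2 : 1; // step past the 8 undefined bytes
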
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - assert(BVN->getNumOperands() == 4 && - "BUILD_VECTOR for v4i1 does not have 4 operands"); - - bool IsConst = true; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - if (!isa(BVN->getOperand(i))) { - IsConst = false; - break; - } - } - - if (IsConst) { - Constant *One = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); - Constant *NegOne = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); - - Constant *CV[4]; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) - CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (isNullConstant(BVN->getOperand(i))) - CV[i] = NegOne; - else - CV[i] = One; - } - - Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = - DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16)); - - SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); - return DAG.getMemIntrinsicNode( - PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } - - SmallVector Stores; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); - if (StoreSize > 4) { - Stores.push_back( - DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, - PtrInfo.getWithOffset(Offset), MVT::i32)); - } else { - SDValue StoreValue = BVN->getOperand(i); - if (StoreSize < 4) - StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); - - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, - PtrInfo.getWithOffset(Offset))); - } - } - - SDValue StoreChain; - if (!Stores.empty()) - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - else - StoreChain = DAG.getEntryNode(); - - // Now load from v4i32 into the QPX register; this will extend it to - // v4i64 but not yet convert it to a floating point. Nevertheless, this - // is typed as v4f64 because the QPX register integer states are not - // explicitly represented. - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), - FIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); - - SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), - LoadedVect); - - SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); - - return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); - } - - // All other QPX vectors are handled by generic code. - if (Subtarget.hasQPX()) - return SDValue(); - // Check if this is a splat of a constant value. 
APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; @@ -9304,7 +8841,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + bool IsPermutedLoad = false; + const SDValue *InputLoad = + getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9927,7 +9466,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this is a load-and-splat, we can do that with a single instruction // in some cases. However if the load has multiple uses, we don't want to // combine it because that will just produce multiple loads. - const SDValue *InputLoad = getNormalLoadInput(V1); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad); if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && InputLoad->hasOneUse()) { @@ -9935,6 +9475,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + // The splat index for permuted loads will be in the left half of the vector + // which is strictly wider than the loaded value by 8 bytes. So we need to + // adjust the splat index to point to the correct address in memory. + if (IsPermutedLoad) { + assert(isLittleEndian && "Unexpected permuted load on big endian target"); + SplatIdx += IsFourByte ? 2 : 1; + assert((SplatIdx < (IsFourByte ? 4 : 2)) && + "Splat of a value outside of the loaded memory"); + } + LoadSDNode *LD = cast(*InputLoad); // For 4-byte load-and-splat, we need Power9. if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9944,10 +9494,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; - // If we are loading a partial vector, it does not make sense to adjust - // the base pointer. This happens with (splat (s_to_v_permuted (ld))). - if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64)) - Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), @@ -10065,42 +9611,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } } - if (Subtarget.hasQPX()) { - if (VT.getVectorNumElements() != 4) - return SDValue(); - - if (V2.isUndef()) V2 = V1; - - int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); - if (AlignIdx != -1) { - return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, - DAG.getConstant(AlignIdx, dl, MVT::i32)); - } else if (SVOp->isSplat()) { - int SplatIdx = SVOp->getSplatIndex(); - if (SplatIdx >= 4) { - std::swap(V1, V2); - SplatIdx -= 4; - } - - return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, - DAG.getConstant(SplatIdx, dl, MVT::i32)); - } - - // Lower this into a qvgpci/qvfperm pair. - - // Compute the qvgpci literal - unsigned idx = 0; - for (unsigned i = 0; i < 4; ++i) { - int m = SVOp->getMaskElt(i); - unsigned mm = m >= 0 ? 
(unsigned) m : i; - idx |= mm << (3-i)*3; - } - - SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, - DAG.getConstant(idx, dl, MVT::i32)); - return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); - } - // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. @@ -10688,279 +10198,6 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } -SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - SDNode *N = Op.getNode(); - - assert(N->getOperand(0).getValueType() == MVT::v4i1 && - "Unknown extract_vector_elt type"); - - SDValue Value = N->getOperand(0); - - // The first part of this is like the store lowering except that we don't - // need to track the chain. - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue StoreChain = DAG.getEntryNode(); - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Extract the value requested. - unsigned Offset = 4*cast(N->getOperand(1))->getZExtValue(); - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - SDValue IntVal = - DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); - - if (!Subtarget.useCRBits()) - return IntVal; - - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); -} - -/// Lowering for QPX v4i1 loads -SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - LoadSDNode *LN = cast(Op.getNode()); - SDValue LoadChain = LN->getChain(); - SDValue BasePtr = LN->getBasePtr(); - - if (Op.getValueType() == MVT::v4f64 || - Op.getValueType() == MVT::v4f32) { - EVT MemVT = LN->getMemoryVT(); - unsigned Alignment = LN->getAlignment(); - - // If this load is properly aligned, then it is legal. 
- if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Op.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Vals[4], LoadChains[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Load; - if (ScalarVT != ScalarMemVT) - Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, - BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - else - Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - - if (Idx == 0 && LN->isIndexed()) { - assert(LN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector load"); - Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), - LN->getAddressingMode()); - } - - Vals[Idx] = Load; - LoadChains[Idx] = Load.getValue(1); - - BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); - - if (LN->isIndexed()) { - SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; - return DAG.getMergeValues(RetOps, dl); - } - - SDValue RetOps[] = { Value, TF }; - return DAG.getMergeValues(RetOps, dl); - } - - assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); - assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); - - // To lower v4i1 from a byte array, we load the byte elements of the - // vector and then reuse the BUILD_VECTOR logic. - - SDValue VectElmts[4], VectElmtChains[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - VectElmts[i] = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, - LN->getPointerInfo().getWithOffset(i), MVT::i8, - /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); - VectElmtChains[i] = VectElmts[i].getValue(1); - } - - LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); - SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); - - SDValue RVals[] = { Value, LoadChain }; - return DAG.getMergeValues(RVals, dl); -} - -/// Lowering for QPX v4i1 stores -SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - StoreSDNode *SN = cast(Op.getNode()); - SDValue StoreChain = SN->getChain(); - SDValue BasePtr = SN->getBasePtr(); - SDValue Value = SN->getValue(); - - if (Value.getValueType() == MVT::v4f64 || - Value.getValueType() == MVT::v4f32) { - EVT MemVT = SN->getMemoryVT(); - unsigned Alignment = SN->getAlignment(); - - // If this store is properly aligned, then it is legal. 
- if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Value.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Stores[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getVectorIdxConstant(Idx, dl)); - SDValue Store; - if (ScalarVT != ScalarMemVT) - Store = - DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - else - Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - - if (Idx == 0 && SN->isIndexed()) { - assert(SN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector store"); - Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), - SN->getAddressingMode()); - } - - BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - Stores[Idx] = Store; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - if (SN->isIndexed()) { - SDValue RetOps[] = { TF, Stores[0].getValue(1) }; - return DAG.getMergeValues(RetOps, dl); - } - - return TF; - } - - assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); - assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Move data into the byte array. 
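Both deleted QPX v4i1 paths lean on the remapping described above: the lanes are materialized as -1.0 (false) and 1.0 (true), and one fused multiply-add converts them to 0.0/1.0 because (V + 1.0) * 0.5 == 0.5*V + 0.5. A tiny runnable check of that identity, for illustration only:

#include <cmath>
#include <cstdio>

int main() {
  // -1.0/1.0 map onto 0.0/1.0 with a single fma, matching the FMA node the
  // removed lowering built from the 0.5 constant splat.
  for (double V : {-1.0, 1.0})
    std::printf("%+4.1f -> %.1f\n", V, std::fma(0.5, V, 0.5));
  return 0;
}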
- SDValue Loads[4], LoadChains[4]; - for (unsigned i = 0; i < 4; ++i) { - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, - PtrInfo.getWithOffset(Offset)); - LoadChains[i] = Loads[i].getValue(1); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - - SDValue Stores[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - Stores[i] = DAG.getTruncStore( - StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), - MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), - SN->getAAInfo()); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - return StoreChain; -} - SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (Op.getValueType() == MVT::v4i32) { @@ -11189,7 +10426,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); @@ -12133,9 +11369,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || - MI.getOpcode() == PPC::SELECT_CC_QFRC || - MI.getOpcode() == PPC::SELECT_CC_QSRC || - MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || @@ -12145,9 +11378,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || @@ -12185,9 +11415,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F16 || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || @@ -12670,8 +11897,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); // Set rounding mode to round-to-zero. - BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); - BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)) + .addImm(31) + .addReg(PPC::RM, RegState::ImplicitDefine); + + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)) + .addImm(30) + .addReg(PPC::RM, RegState::ImplicitDefine); // Perform addition. 
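The mtfsb1 31 / mtfsb0 30 pair above writes 0b01 into FPSCR bits 62:63 (the RN field), selecting round-toward-zero before the addition is performed; the patch additionally marks RM as implicitly defined so the dependency is visible to later passes. A portable sketch of the effect on the subsequent add; fesetround is the C analogue used here for illustration, not what the backend emits:

#include <cfenv>
#include <cstdio>

int main() {
  volatile float A = 1.0f, B = 0x1.8p-24f; // the sum is inexact in float
  std::fesetround(FE_TONEAREST);
  float Nearest = A + B;          // rounds up to 1 + 2^-23
  std::fesetround(FE_TOWARDZERO); // what the mtfsb1/mtfsb0 pair selects
  float TowardZero = A + B;       // truncates back to 1.0
  std::printf("nearest=%a toward-zero=%a\n", Nearest, TowardZero);
  std::fesetround(FE_TONEAREST);
  return 0;
}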
BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); @@ -12733,10 +11965,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // the immediate to set the bits 62:63 of FPSCR. unsigned Mode = MI.getOperand(1).getImm(); BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0)) - .addImm(31); + .addImm(31) + .addReg(PPC::RM, RegState::ImplicitDefine); BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0)) - .addImm(30); + .addImm(30) + .addReg(PPC::RM, RegState::ImplicitDefine); } else if (MI.getOpcode() == PPC::SETRND) { DebugLoc dl = MI.getDebugLoc(); @@ -12880,9 +12114,7 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); @@ -12901,9 +12133,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); @@ -13001,24 +12231,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: @@ -13047,24 +12259,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvstfiw: - case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: @@ -15062,18 +14256,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty); - Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - Align ScalarABIAlignment = 
DAG.getDataLayout().getABITypeAlign(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v4f32)) || - (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && - LD->getAlign() >= ScalarABIAlignment)) && + VT == MVT::v4f32))) && LD->getAlign() < ABIAlignment) { - // This is a type-legal unaligned Altivec or QPX load. + // This is a type-legal unaligned Altivec load. SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); @@ -15104,24 +14294,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; - if (Subtarget.hasAltivec()) { - Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : - Intrinsic::ppc_altivec_lvsl; - IntrLD = Intrinsic::ppc_altivec_lvx; - IntrPerm = Intrinsic::ppc_altivec_vperm; - PermCntlTy = MVT::v16i8; - PermTy = MVT::v4i32; - LDTy = MVT::v4i32; - } else { - Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : - Intrinsic::ppc_qpx_qvlpcls; - IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : - Intrinsic::ppc_qpx_qvlfs; - IntrPerm = Intrinsic::ppc_qpx_qvfperm; - PermCntlTy = MVT::v4f64; - PermTy = MVT::v4f64; - LDTy = MemVT.getSimpleVT(); - } + Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl; + IntrLD = Intrinsic::ppc_altivec_lvx; + IntrPerm = Intrinsic::ppc_altivec_vperm; + PermCntlTy = MVT::v16i8; + PermTy = MVT::v4i32; + LDTy = MVT::v4i32; SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); @@ -15192,10 +14371,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) - Perm = Subtarget.hasAltivec() ? - DAG.getNode(ISD::BITCAST, dl, VT, Perm) : - DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX - DAG.getTargetConstant(1, dl, MVT::i64)); + Perm = Subtarget.hasAltivec() + ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) + : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, + DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. @@ -15211,14 +14390,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); - if ((IID == Intr || - IID == Intrinsic::ppc_qpx_qvlpcld || - IID == Intrinsic::ppc_qpx_qvlpcls) && - N->getOperand(1)->getOpcode() == ISD::ADD) { + if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); - int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? - 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; + int Bits = 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero(Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) @@ -15228,7 +14403,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { + cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == + IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. 
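The combine that ends above reuses an already-computed lvsl/lvsr result: the permute control depends only on the low four bits of the effective address, so once MaskedValueIsZero proves the addend cannot change EA & 0xF, the control vector built from the base pointer is interchangeable with the one built from base + addend. A reduced model of that test, with illustrative names:

#include <cstdint>
#include <cstdio>

// lvsl/lvsr derive the shift permutation from EA & 0xF alone, so an addend
// whose low Bits bits are known zero cannot alter the control vector.
static bool canReuseShiftControl(uint64_t Addend, unsigned Bits) {
  uint64_t Mask = (uint64_t(1) << Bits) - 1; // Bits == 4 -> 16-byte units
  return (Addend & Mask) == 0;
}

int main() {
  std::printf("%d %d\n", canReuseShiftControl(32, 4),  // aligned: reusable
              canReuseShiftControl(12, 4));            // not reusable
  return 0;
}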
@@ -15777,17 +14953,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; @@ -15923,9 +15091,15 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS, Instruction *I) const { - // PPC does not allow r+i addressing modes for vectors! - if (Ty->isVectorTy() && AM.BaseOffs != 0) + unsigned AS, + Instruction *I) const { + // Vector type r+i form is supported since power9 as DQ form. We don't check + // the offset matching DQ form requirement(off % 16 == 0), because on PowerPC, + // imm form is preferred and the offset can be adjusted to use imm form later + // in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and + // max offset to check legal addressing mode, we should be a little aggressive + // to contain other offsets for that LSRUse. + if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector()) return false; // PPC allows a sign-extended 16-bit immediate field. @@ -16079,12 +15253,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: @@ -16106,18 +15274,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_lxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvlfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16132,45 +15288,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad; return true; } - case Intrinsic::ppc_qpx_qvlfda: - case Intrinsic::ppc_qpx_qvlfsa: - case Intrinsic::ppc_qpx_qvlfcda: - case Intrinsic::ppc_qpx_qvlfcsa: - case Intrinsic::ppc_qpx_qvlfiwaa: - case Intrinsic::ppc_qpx_qvlfiwza: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOLoad; - return true; - } - 
case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: @@ -16192,18 +15309,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::ppc_vsx_stxvd2x: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvstfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16218,39 +15323,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::ppc_qpx_qvstfda: - case Intrinsic::ppc_qpx_qvstfsa: - case Intrinsic::ppc_qpx_qvstfcda: - case Intrinsic::ppc_qpx_qvstfcsa: - case Intrinsic::ppc_qpx_qvstfiwa: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOStore; - return true; - } default: break; } @@ -16263,14 +15335,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - // When expanding a memset, require at least two QPX instructions to cover - // the cost of loading the value to be stored from the constant pool. - if (Subtarget.hasQPX() && Op.size() >= 32 && - (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) && - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - return MVT::v4f64; - } - // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Op.size() >= 16 && @@ -16489,7 +15553,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves - if (Subtarget.hasVSX() || Subtarget.hasQPX()) + if (Subtarget.hasVSX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); @@ -16535,8 +15599,7 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, switch (Opc) { case PPCISD::FNMSUB: - // TODO: QPX subtarget is deprecated. No transformation here. - if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX()) + if (!Op.hasOneUse() || !isTypeLegal(VT)) break; const TargetOptions &Options = getTargetMachine().Options; @@ -17017,8 +16080,7 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N, bool LegalOps = !DCI.isBeforeLegalizeOps(); SDLoc Loc(N); - // TODO: QPX subtarget is deprecated. No transformation here. 
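For the getNegatedExpression case being simplified below, the relevant identity is that PPCISD::FNMSUB computes -(A*B - C), so a negation folds back into the same opcode by negating A and C: -fnmsub(A, B, C) == fnmsub(-A, B, -C). A quick numeric check with plain doubles standing in for SDValues:

#include <cstdio>

// PPCISD::FNMSUB semantics: -(A*B - C).
static double fnmsub(double A, double B, double C) { return -(A * B - C); }

int main() {
  double A = 3.0, B = 5.0, C = 7.0;
  // Both sides print 8: the negation is absorbed without extra FNEG nodes.
  std::printf("%g %g\n", -fnmsub(A, B, C), fnmsub(-A, B, -C));
  return 0;
}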
- if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT)) + if (!isOperationLegal(ISD::FMA, VT)) return SDValue(); // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 768eaa43e0135..80588a1bd4019 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -427,22 +427,6 @@ namespace llvm { /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) VABSD, - /// QVFPERM = This corresponds to the QPX qvfperm instruction. - QVFPERM, - - /// QVGPCI = This corresponds to the QPX qvgpci instruction. - QVGPCI, - - /// QVALIGNI = This corresponds to the QPX qvaligni instruction. - QVALIGNI, - - /// QVESPLATI = This corresponds to the QPX qvesplati instruction. - QVESPLATI, - - /// QBFLT = Access the underlying QPX floating-point boolean - /// representation. - QBFLT, - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or /// lower (IDX=1) half of v4f32 to v2f64. FP_EXTEND_HALF, @@ -519,10 +503,6 @@ namespace llvm { /// Store scalar integers from VSR. ST_VSR_SCAL_INT, - /// QBRC, CHAIN = QVLFSb CHAIN, Ptr /// The 4xf32 load used for v4i1 constants. - QVLFSb, - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes /// except they ensure that the compare input is zero-extended for /// sub-word versions because the atomic loads zero-extend. @@ -1042,11 +1022,6 @@ namespace llvm { } }; - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET = ISD::NON_EXTLOAD) const; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 632d4d9deb8a2..5ff5fc78326ba 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -642,7 +642,6 @@ class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let FRA = 0; } -// Used for QPX class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, itin> { @@ -1781,14 +1780,6 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } -// Used for QPX -class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { - let FRA = 0; - let FRC = 0; -} - // 1.7.13 M-Form class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> @@ -2099,49 +2090,6 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr, let Inst{23-31} = xo; } -// Z23-Form (used by QPX) -class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { - bits<5> FRT; - bits<5> FRA; - bits<5> FRB; - bits<2> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-22} = idx; - let Inst{23-30} = xo; - let Inst{31} = RC; -} - -class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { - let FRB = 0; -} - -class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { - bits<5> FRT; - bits<12> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-22} = idx; - 
let Inst{23-30} = xo; - let Inst{31} = RC; -} - class Z23Form_8 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : I { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 99e25bb130ce4..888a77464f343 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -259,14 +259,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case PPC::XVMULDP: case PPC::XVMULSP: case PPC::XSMULSP: - // QPX Add: - case PPC::QVFADD: - case PPC::QVFADDS: - case PPC::QVFADDSs: - // QPX Multiply: - case PPC::QVFMUL: - case PPC::QVFMULS: - case PPC::QVFMULSs: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); // Fixed point: @@ -300,9 +292,7 @@ static const uint16_t FMAOpIdxInfo[][5] = { {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2}, {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2}, {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}, - {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}, - {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1}, - {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}}; + {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}}; // Check if an opcode is a FMA instruction. If it is, return the index in array // FMAOpIdxInfo. Otherwise, return -1. @@ -666,7 +656,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: - case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -1343,12 +1332,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf; - else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMR; - else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRs; - else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRb; else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) @@ -1393,12 +1376,6 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat4Spill; } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VRSaveSpill; - } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat8Spill; - } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat4Spill; - } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; } else { @@ -4647,7 +4624,8 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth( // Handle only loads/stores with base register followed by immediate offset. 
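The change just below widens this shape test so a frame-index base is accepted alongside a register base. A reduced model of the accepted operand layout; Operand is a stand-in for MachineOperand:

#include <cstdio>

struct Operand { enum Kind { Reg, Imm, FI } K; };

// Shape required below: exactly three explicit operands, operand 1 an
// immediate offset, operand 2 either a register or (newly) a frame index.
static bool hasBasePlusImmShape(const Operand *Ops, unsigned NumExplicit) {
  if (NumExplicit != 3)
    return false;
  return Ops[1].K == Operand::Imm &&
         (Ops[2].K == Operand::Reg || Ops[2].K == Operand::FI);
}

int main() {
  Operand FrameBased[3] = {{Operand::Reg}, {Operand::Imm}, {Operand::FI}};
  std::printf("%d\n", hasBasePlusImmShape(FrameBased, 3)); // now accepted
  return 0;
}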
if (LdSt.getNumExplicitOperands() != 3) return false; - if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg()) + if (!LdSt.getOperand(1).isImm() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI())) return false; if (!LdSt.hasOneMemOperand()) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 43973c627fcf1..bdcfa76505daf 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -123,9 +123,6 @@ enum SpillOpcodeKey { SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, SOK_VRSaveSpill, - SOK_QuadFloat8Spill, - SOK_QuadFloat4Spill, - SOK_QuadBitSpill, SOK_SpillToVSR, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. @@ -136,32 +133,28 @@ enum SpillOpcodeKey { { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \ - PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \ - PPC::QVLFDXb, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::SPILLTOVSR_LD \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \ - PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \ - PPC::EVSTDD \ + PPC::SPILLTOVSR_ST, PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \ - PPC::SPILLTOVSR_ST \ + PPC::SPILL_VRSAVE, PPC::SPILLTOVSR_ST \ } // Initialize arrays for load and store spill opcodes on supported subtargets. 
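The Pwr8/Pwr9 opcode macros above are positional: every entry lines up with one SpillOpcodeKey value, which is why deleting the three SOK_Quad* keys also drops the matching QVLFDX/QVSTFDX-family entries from each table. A reduced model of that enum-indexed-table invariant, with a static_assert to catch drift (enum members here are illustrative):

#include <cstdio>

enum SpillOpcodeKey { SOK_Int4Spill, SOK_Float8Spill, SOK_Vector16Spill,
                      SOK_LastOpcodeSpill };

static const unsigned LoadOpcodes[] = {101, 102, 103}; // indexed by key

static_assert(sizeof(LoadOpcodes) / sizeof(LoadOpcodes[0]) ==
                  SOK_LastOpcodeSpill,
              "spill opcode table out of sync with SpillOpcodeKey");

int main() {
  std::printf("%u\n", LoadOpcodes[SOK_Float8Spill]);
  return 0;
}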
@@ -273,10 +266,10 @@ class PPCInstrInfo : public PPCGenInstrInfo { } static bool isSameClassPhysRegCopy(unsigned Opcode) { - unsigned CopyOpcodes[] = - { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf, - PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb, - PPC::CROR, PPC::EVOR, -1U }; + unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR, + PPC::VOR, PPC::XXLOR, PPC::XXLORf, + PPC::XSCPSGNDP, PPC::MCRF, PPC::CROR, + PPC::EVOR, -1U}; for (int i = 0; CopyOpcodes[i] != -1U; i++) if (Opcode == CopyOpcodes[i]) return true; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index c565758973bf5..771a715926fd7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -203,16 +203,6 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; -def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; -def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>; -def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>; -def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>; - -def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>; - -def PPCqvlfsb : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb, - [SDNPHasChain, SDNPMayLoad]>; - def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift @@ -757,7 +747,13 @@ def PPCS34ImmAsmOperand : AsmOperandClass { } def s34imm : Operand { let PrintMethod = "printS34ImmOperand"; - let EncoderMethod = "getImm34Encoding"; + let EncoderMethod = "getImm34EncodingNoPCRel"; + let ParserMatchClass = PPCS34ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<34>"; +} +def s34imm_pcrel : Operand { + let PrintMethod = "printS34ImmOperand"; + let EncoderMethod = "getImm34EncodingPCRel"; let ParserMatchClass = PPCS34ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<34>"; } @@ -1567,11 +1563,12 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), "bc 4, $bi, $dst">; - let isReturn = 1, Uses = [LR, RM] in + let isReturn = 1, Uses = [LR, RM] in { def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi), "bclr 12, $bi, 0", IIC_BrB, []>; def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi), "bclr 4, $bi, 0", IIC_BrB, []>; + } } let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in { @@ -2588,22 +2585,7 @@ def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), "ftsqrt $crD, $fB", IIC_FPCompare>; -let Uses = [RM], mayRaiseFPException = 1 in { - let hasSideEffects = 0 in { - defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiw", "$frD, $frB", IIC_FPGeneral, - []>; - defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiwu", "$frD, $frB", IIC_FPGeneral, - []>; - defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiwz", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfctiwz f64:$frB))]>; - - defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), - "frsp", "$frD, $frB", IIC_FPGeneral, - [(set f32:$frD, (any_fpround f64:$frB))]>; - +let mayRaiseFPException = 1, hasSideEffects = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm FRIND : 
XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB), "frin", "$frD, $frB", IIC_FPGeneral, @@ -2611,9 +2593,7 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB), "frin", "$frD, $frB", IIC_FPGeneral, [(set f32:$frD, (any_fround f32:$frB))]>; - } - let hasSideEffects = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB), "frip", "$frD, $frB", IIC_FPGeneral, @@ -2635,6 +2615,22 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB), "frim", "$frD, $frB", IIC_FPGeneral, [(set f32:$frD, (any_ffloor f32:$frB))]>; +} + +let Uses = [RM], mayRaiseFPException = 1, hasSideEffects = 0 in { + defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiw", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwu", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiwz f64:$frB))]>; + + defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), + "frsp", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (any_fpround f64:$frB))]>; defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB), "fsqrt", "$frD, $frB", IIC_FPSqrtD, @@ -2642,8 +2638,7 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB), "fsqrts", "$frD, $frB", IIC_FPSqrtS, [(set f32:$frD, (any_fsqrt f32:$frB))]>; - } - } +} } /// Note that FMR is defined as pseudo-ops on the PPC970 because they are @@ -2920,13 +2915,17 @@ let Uses = [RM] in { // The above pseudo gets expanded to make use of the following instructions // to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level. -let Uses = [RM], Defs = [RM] in { - def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), - "mtfsb0 $FM", IIC_IntMTFSB0, []>, - PPC970_DGroup_Single, PPC970_Unit_FPU; - def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), - "mtfsb1 $FM", IIC_IntMTFSB0, []>, - PPC970_DGroup_Single, PPC970_Unit_FPU; + +// When FM is 30/31, we are setting the 62/63 bit of FPSCR, the implicit-def +// RM should be set. +def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), + "mtfsb0 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; +def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), + "mtfsb1 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + +let Defs = [RM] in { let isCodeGenOnly = 1 in def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT), "mtfsf $FM, $rT", IIC_IntMTFSB0, []>, @@ -3461,7 +3460,6 @@ include "PPCInstrAltivec.td" include "PPCInstrSPE.td" include "PPCInstr64Bit.td" include "PPCInstrVSX.td" -include "PPCInstrQPX.td" include "PPCInstrHTM.td" def crnot : OutPatFrag<(ops node:$in), @@ -3903,105 +3901,47 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)), defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; -let Predicates = [HasFPU] in { -// Instantiations of CRNotPat for f32. 
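The per-type pattern blocks removed below are re-expressed through the FSetCCPat multiclass added further down. The fact they all share: fcmpu/xscmpuqp sets exactly one of four CR bits, and every floating-point condition code is either a single bit or the complement of one, which is what CRNotPat encodes (SETUGE, for instance, is NOT lt). A scalar model of the mapping:

#include <cmath>
#include <cstdio>

struct CRBits { bool lt, gt, eq, un; }; // sub_lt, sub_gt, sub_eq, sub_un

static CRBits fcmpu(double A, double B) {
  if (std::isnan(A) || std::isnan(B))
    return {false, false, false, true}; // unordered: only the un bit is set
  return {A < B, A > B, A == B, false};
}

int main() {
  CRBits C = fcmpu(1.0, std::nan(""));
  std::printf("SETOLT=%d SETUGE=%d SETUNE=%d\n",
              C.lt,   // direct read of one bit
              !C.lt,  // CRNotPat: unordered/greater/equal == NOT lt
              !C.eq); // CRNotPat: unordered/not-equal == NOT eq
  return 0;
}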
-defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; - -// Instantiations of CRNotPat for f64. -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; - -// Instantiations of CRNotPat for f128. -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +multiclass FSetCCPat { + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; + + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + def : 
Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; } -// SETCC for f32. let Predicates = [HasFPU] in { -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; +// SETCC for f32. +defm : FSetCCPat; // SETCC for f64. -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; +defm : FSetCCPat; // SETCC for f128. -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETLT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOGT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETGT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOEQ)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; - +defm : FSetCCPat; } // This must be in this file because it relies on patterns defined in this file @@ -4323,9 +4263,11 @@ def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L), def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA), "mcrfs $BF, $BFA", IIC_BrMCR>; +// If W is 0 and BF is 7, the 60:63 bits will be set, we should set the +// implicit-def RM. def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), "mtfsfi $BF, $U, $W", IIC_IntMFFS>; - +let Defs = [CR1] in def MTFSFI_rec : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm; @@ -4333,12 +4275,15 @@ def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>; def : InstAlias<"mtfsfi. $BF, $U", (MTFSFI_rec crrc:$BF, i32imm:$U, 0)>; let Predicates = [HasFPU] in { +let Defs = [RM] in { def MTFSF : XFLForm_1<63, 711, (outs), (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>; +let Defs = [CR1] in def MTFSF_rec : XFLForm_1<63, 711, (outs), (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), "mtfsf. 
$FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isRecordForm; +} def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>; def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSF_rec i32imm:$FLM, f8rc:$FRB, 0, 0)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index b468a8f318ee3..a92168789bbe0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -59,6 +59,39 @@ class PI pref, bits<6> opcode, dag OOL, dag IOL, string asmstr, string BaseName = ""; } +// VX-Form: [ PO VT R VB RC XO ] +class VXForm_VTB5_RC xo, bits<5> R, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VT; + bits<5> VB; + bit RC = 0; + + let Pattern = pattern; + + let Inst{6-10} = VT; + let Inst{11-15} = R; + let Inst{16-20} = VB; + let Inst{21} = RC; + let Inst{22-31} = xo; +} + +// Multiclass definition to account for record and non-record form +// instructions of VXRForm. +multiclass VXForm_VTB5_RCr xo, bits<5> R, dag OOL, dag IOL, + string asmbase, string asmstr, + InstrItinClass itin, list pattern> { + let BaseName = asmbase in { + def NAME : VXForm_VTB5_RC, RecFormRel; + let Defs = [CR6] in + def _rec : VXForm_VTB5_RC, isRecordForm, RecFormRel; + } +} + class MLS_DForm_R_SI34_RTA5_MEM opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : PI<1, opcode, OOL, IOL, asmstr, itin> { @@ -454,12 +487,13 @@ multiclass 8LS_DForm_R_SI34_XT6_RA5_p opcode, dag OOL, dag IOL, def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; +def PairedVectorMemops : Predicate<"PPCSubTarget->pairedVectorMemops()">; let Predicates = [PrefixInstrs] in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { defm PADDI8 : MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT), @@ -469,7 +503,7 @@ let Predicates = [PrefixInstrs] in { } defm PADDI : MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT), @@ -821,6 +855,14 @@ let Predicates = [IsISA3_1] in { (int_ppc_altivec_vsrdbi v16i8:$VRA, v16i8:$VRB, i32:$SH))]>; + defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB), + "vstribr", "$vT, $vB", IIC_VecGeneral, []>; + defm VSTRIBL : VXForm_VTB5_RCr<13, 0, (outs vrrc:$vT), (ins vrrc:$vB), + "vstribl", "$vT, $vB", IIC_VecGeneral, []>; + defm VSTRIHR : VXForm_VTB5_RCr<13, 3, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihr", "$vT, $vB", IIC_VecGeneral, []>; + defm VSTRIHL : VXForm_VTB5_RCr<13, 2, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihl", "$vT, $vB", IIC_VecGeneral, []>; def VINSW : VXForm_1<207, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, gprc:$rB), "vinsw $vD, $rB, $UIM", IIC_VecGeneral, @@ -905,6 +947,38 @@ let Predicates = [IsISA3_1] in { [(set v2i64:$vD, (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>, RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextdubvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def 
VEXTDUBVRX : VAForm_1a<25, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextdubvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduhvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduhvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduwvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextduwvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextddvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; + def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, g8rc:$rC), + "vextddvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, []>; def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpdepd $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, @@ -976,13 +1050,17 @@ let Predicates = [IsISA3_1] in { "vmulld $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>; def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>; def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>; def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>; def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vmulhud $vD, $vA, $vB", IIC_VecGeneral, []>; + "vmulhud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>; def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vmodsw $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>; @@ -1100,6 +1178,17 @@ let Predicates = [IsISA3_1] in { (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>; } +let AddedComplexity = 400, Predicates = [IsISA3_1] in { + def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$rS, 0)), xoaddr:$src), + (STXVRBX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$rS, 0)), xoaddr:$src), + (STXVRHX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(store (i32 (vector_extract v4i32:$rS, 0)), xoaddr:$src), + (STXVRWX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; + def : Pat<(store (i64 (vector_extract v2i64:$rS, 0)), xoaddr:$src), + (STXVRDX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$src)>; +} + let AddedComplexity = 400, Predicates = [PrefixInstrs] in { def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, i32immNonAllOneNonZero:$A, diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td deleted file mode 100644 index 2265af2815cb5..0000000000000 --- a/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ /dev/null @@ -1,1212 +0,0 @@ -//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the QPX extension to the PowerPC instruction set. -// Reference: -// Book Q: QPX Architecture Definition. IBM (as updated in) 2011. -// -//===----------------------------------------------------------------------===// - -def PPCRegQFRCAsmOperand : AsmOperandClass { - let Name = "RegQFRC"; let PredicateMethod = "isRegNumber"; -} -def qfrc : RegisterOperand { - let ParserMatchClass = PPCRegQFRCAsmOperand; -} -def PPCRegQSRCAsmOperand : AsmOperandClass { - let Name = "RegQSRC"; let PredicateMethod = "isRegNumber"; -} -def qsrc : RegisterOperand { - let ParserMatchClass = PPCRegQSRCAsmOperand; -} -def PPCRegQBRCAsmOperand : AsmOperandClass { - let Name = "RegQBRC"; let PredicateMethod = "isRegNumber"; -} -def qbrc : RegisterOperand { - let ParserMatchClass = PPCRegQBRCAsmOperand; -} - -//===----------------------------------------------------------------------===// -// Helpers for defining instructions that directly correspond to intrinsics. - -// QPXA1_Int - A AForm_1 intrinsic definition. -class QPXA1_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1; -// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions). -class QPXA1s_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1; -// QPXA2_Int - A AForm_2 intrinsic definition. -class QPXA2_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_2; -// QPXA3_Int - A AForm_3 intrinsic definition. -class QPXA3_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_3; -// QPXA4_Int - A AForm_4a intrinsic definition. -class QPXA4_Int opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_4a; -// QPXX18_Int - A XForm_18 intrinsic definition. -class QPXX18_Int opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_18; -// QPXX19_Int - A XForm_19 intrinsic definition. -class QPXX19_Int opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_19; - -//===----------------------------------------------------------------------===// -// Pattern Frags. - -def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; -def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset), - (pre_truncst node:$val, - node:$base, node:$offset), [{ - return cast(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast(N->getOperand(1))->getZExtValue() == 0; -}]>; - -def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast(N->getOperand(1))->getZExtValue() == 1; -}]>; - -let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs. - def u12 : ImmLeaf; - -//===----------------------------------------------------------------------===// -// Instruction Definitions. - -def HasQPX : Predicate<"Subtarget->hasQPX()">; -let Predicates = [HasQPX] in { -let DecoderNamespace = "QPX" in { -let hasSideEffects = 0 in { // QPX instructions don't have side effects. 
-let Uses = [RM] in { - // Add Instructions - let isCommutable = 1 in { - def QVFADD : AForm_2<4, 21, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; - def QVFADDSs : AForm_2<0, 21, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; - } - def QVFSUB : AForm_2<4, 20, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; - def QVFSUBSs : AForm_2<0, 20, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; - - // Estimate Instructions - def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfre $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; - def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; - let isCodeGenOnly = 1 in - def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfres $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; - - def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; - def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; - let isCodeGenOnly = 1 in - def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; - - // Multiply Instructions - let isCommutable = 1 in { - def QVFMUL : AForm_3<4, 25, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), - "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; - def QVFMULSs : AForm_3<0, 25, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), - "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; - } - def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; - def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; - - // Multiply-add instructions - def QVFMADD : AForm_1<4, 29, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; - def QVFMADDSs : AForm_1<0, 29, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; - def QVFNMADD : AForm_1<4, 31, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; - def QVFNMADDSs : AForm_1<0, 31, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - v4f32:$FRB)))]>; - def QVFMSUB : AForm_1<4, 
28, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; - def QVFMSUBSs : AForm_1<0, 28, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB)))]>; - def QVFNMSUB : AForm_1<4, 30, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB))))]>; - let isCodeGenOnly = 1 in - def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; - def QVFNMSUBSs : AForm_1<0, 30, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB))))]>; - def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; - def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; - def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; - def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; - def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; - def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; - def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; - def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; - - // Select Instruction - let isCodeGenOnly = 1 in - def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; - def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), - (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (vselect v4i1:$FRA, - v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), - (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (vselect v4i1:$FRA, - v4f32:$FRC, v4f32:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), - (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4i1:$FRT, (vselect v4i1:$FRA, - v4i1:$FRC, v4i1:$FRB))]>; - - // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after - // instruction selection into a branch sequence. - def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, - i32imm:$BROPC), "#SELECT_CC_QFRC", - []>; - def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, - i32imm:$BROPC), "#SELECT_CC_QSRC", - []>; - def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, - i32imm:$BROPC), "#SELECT_CC_QBRC", - []>; - - // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition - // register bit directly. 
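// Before the SELECT_* definitions that follow: both pseudo families encode a
// plain data-flow select, dst = cond ? T : F, which the custom inserter later
// expands into a conditional-branch diamond. A minimal C++ sketch of that
// semantics (illustrative only, not the expansion code itself):

#include <array>
#include <cassert>

template <typename VecT>
static VecT selectPseudo(bool CondBit, const VecT &T, const VecT &F) {
  if (CondBit) // branch on the condition bit
    return T;  // "true" successor copies $T
  return F;    // "false" successor copies $F
}

int main() {
  std::array<double, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8};
  assert(selectPseudo(true, A, B) == A && selectPseudo(false, A, B) == B);
  return 0;
}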
- def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, - qfrc:$T, qfrc:$F), "#SELECT_QFRC", - [(set v4f64:$dst, - (select i1:$cond, v4f64:$T, v4f64:$F))]>; - def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, - qsrc:$T, qsrc:$F), "#SELECT_QSRC", - [(set v4f32:$dst, - (select i1:$cond, v4f32:$T, v4f32:$F))]>; - def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, - qbrc:$T, qbrc:$F), "#SELECT_QBRC", - [(set v4i1:$dst, - (select i1:$cond, v4i1:$T, v4i1:$F))]>; - - // Convert and Round Instructions - def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; - let isCodeGenOnly = 1 in - def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; - def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; - def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; - def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; - def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; - def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; - def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; - def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>; - let isCodeGenOnly = 1 in - def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; - def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; - def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; - - let isCodeGenOnly = 1 in - def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; - def QVFRSPs : XForm_19<4, 12, - (outs qsrc:$FRT), (ins qfrc:$FRB), - "qvfrsp $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; - - def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; - - def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fround v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround v4f32:$FRB))]>; - - def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; - - def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; - - // Move Instructions - def QVFMR : XForm_19<4, 72, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4f64:$FRT, v4f64:$FRB) */]>; - let isCodeGenOnly = 1 in { - def QVFMRs : XForm_19<4, 72, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfmr 
$FRT, $FRB", IIC_VecPerm, - [/* (set v4f32:$FRT, v4f32:$FRB) */]>; - def QVFMRb : XForm_19<4, 72, - (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4i1:$FRT, v4i1:$FRB) */]>; - } - def QVFNEG : XForm_19<4, 40, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFNEGs : XForm_19<4, 40, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; - def QVFABS : XForm_19<4, 264, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFABSs : XForm_19<4, 264, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; - def QVFNABS : XForm_19<4, 136, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNABSs : XForm_19<4, 136, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; - def QVFCPSGN : XForm_18<4, 8, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; - let isCodeGenOnly = 1 in - def QVFCPSGNs : XForm_18<4, 8, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; - - def QVALIGNI : Z23Form_1<4, 5, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvaligni v4f64:$FRA, v4f64:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIs : Z23Form_1<4, 5, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvaligni v4f32:$FRA, v4f32:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIb : Z23Form_1<4, 5, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvaligni v4i1:$FRA, v4i1:$FRB, - (i32 imm:$idx)))]>; - - def QVESPLATI : Z23Form_2<4, 37, - (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIs : Z23Form_2<4, 37, - (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIb : Z23Form_2<4, 37, - (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; - - def QVFPERM : AForm_1<4, 6, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFPERMs : AForm_1<4, 6, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; - - let isReMaterializable = 1, isAsCheapAsAMove = 1 in - def QVGPCI : Z23Form_3<4, 133, - (outs qfrc:$FRT), (ins u12imm:$idx), - "qvgpci 
$FRT, $idx", IIC_VecPerm, - [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>; - - // Compare Instruction - let isCodeGenOnly = 1 in - def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; - def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; - def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; - def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; - def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; - - let isCodeGenOnly = 1 in - def QVFLOGICAL : XForm_20<4, 4, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - def QVFLOGICALb : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - let isCodeGenOnly = 1 in - def QVFLOGICALs : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - - // Load indexed instructions - let mayLoad = 1 in { - def QVLFDX : XForm_1_memOp<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (load xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFDXb : XForm_1_memOp<31, 583, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFDXA : XForm_1<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFDUX : XForm_1<31, 615, - (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - let RC = 1 in - def QVLFDUXA : XForm_1<31, 615, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSX : XForm_1_memOp<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - 
"qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>; - - let isCodeGenOnly = 1 in - def QVLFSXb : XForm_1<31, 519, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFSXs : XForm_1_memOp<31, 519, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f32:$FRT, (load xoaddr:$src))]>; - - let RC = 1 in - def QVLFSXA : XForm_1<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSUX : XForm_1<31, 551, - (outs qsrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - - let RC = 1 in - def QVLFSUXA : XForm_1<31, 551, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDX : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDXA : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDUX : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDUXA : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSX : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLFCSXs : XForm_1<31, 7, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFCSXA : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSUX : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCSUXA : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWAX : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWAXA : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWZX : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWZXA : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; - } - - - def QVLPCLDX : XForm_1<31, 582, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcldx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCLSX : XForm_1<31, 518, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLPCLSXint : XForm_11<31, 518, - (outs qfrc:$FRT), (ins G8RC:$src), - "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; - def QVLPCRDX : XForm_1<31, 70, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCRSX : XForm_1<31, 6, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; - - // Store indexed instructions - let mayStore = 1 in { - def QVSTFDX : XForm_8_memOp<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, - [(store qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFDXb : XForm_8_memOp<31, 711, - (outs), (ins qbrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def 
QVSTFDXA : XForm_8<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFDUXA : XForm_8<31, 743, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDXI : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDXIA : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUXI : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDUXIA : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSX : XForm_8_memOp<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFSXs : XForm_8_memOp<31, 647, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(store qsrc:$FRT, xoaddr:$dst)]>; - - let RC = 1 in - def QVSTFSXA : XForm_8<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qsrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - let isCodeGenOnly = 1 in - def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFSUXA : XForm_8<31, 679, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSXI : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSXIA : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUXI : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSUXIA : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDX : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXA : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSX : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - let isCodeGenOnly = 1 in - def QVSTFCSXs : XForm_8<31, 135, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def QVSTFCSXA : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUX : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXA : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUX : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsux $FRT, $dst", 
IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXA : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDXI : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXIA : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSXI : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSXIA : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUXI : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXIA : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUXI : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXIA : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFIWX : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFIWXA : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>; - } -} - -} // neverHasSideEffects -} - -def : InstAlias<"qvfclr $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>; -def : InstAlias<"qvfand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>; -def : InstAlias<"qvfandc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>; -def : InstAlias<"qvfctfb $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>; -def : InstAlias<"qvfxor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>; -def : InstAlias<"qvfor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>; -def : InstAlias<"qvfnor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>; -def : InstAlias<"qvfequ $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>; -def : InstAlias<"qvfnot $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>; -def : InstAlias<"qvforc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>; -def : InstAlias<"qvfnand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>; -def : InstAlias<"qvfset $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>; - -//===----------------------------------------------------------------------===// -// Additional QPX Patterns -// - -def : Pat<(v4f64 (scalar_to_vector f64:$A)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(v4f32 (scalar_to_vector f32:$A)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 1)), - (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 2)), - (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 3)), - (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; - -def : Pat<(f32 (extractelt v4f32:$S, 1)), - (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 2)), - (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; 
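// The qvf* aliases above all lower to QVFLOGICALb with a 4-bit immediate that
// reads naturally as a two-input truth table. The bit numbering below is
// inferred from the alias values (1 = and, 6 = xor, 7 = or, 8 = nor,
// 14 = nand, 15 = set) and is an assumption, not a statement of the actual
// hardware encoding.

#include <cassert>

// Result for inputs (A, B) is bit (2*!B + !A) of the immediate; this
// reproduces every alias listed above.
static bool qvflogical(bool A, bool B, unsigned TTTT) {
  unsigned Bit = (unsigned(!B) << 1) | unsigned(!A);
  return (TTTT >> Bit) & 1u;
}

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      assert(qvflogical(A, B, 1) == (A && B));   // qvfand
      assert(qvflogical(A, B, 6) == (A != B));   // qvfxor
      assert(qvflogical(A, B, 7) == (A || B));   // qvfor
      assert(qvflogical(A, B, 14) == !(A && B)); // qvfnand
    }
  return 0;
}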
-def : Pat<(f32 (extractelt v4f32:$S, 3)), - (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERM $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERMs $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; - -def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C), - (QVFPERM $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B), - (QVFCPSGN $A, $B)>; - -// FCOPYSIGN's operand types need not agree. -def : Pat<(fcopysign v4f64:$frB, v4f32:$frA), - (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>; -def : Pat<(fcopysign QSRC:$frB, QFRC:$frA), - (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>; - -def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>; -def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>; -def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>; - -def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>; -def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>; -def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>; -def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>; - -def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>; -def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>; - -def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B), - (QVFADD $A, $B)>; -def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B), - (QVFSUB $A, $B)>; -def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B), - (QVFMUL $A, $B)>; - -// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; - -def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMSUB $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMSUB $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src), - (QVLFDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src), - (QVLFSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src), - (QVLFCDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src), - (QVLFCDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src), - (QVLFCSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src), - (QVLFCSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src), - (QVLFIWAXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src), - (QVLFIWAX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src), - (QVLFIWZXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src), - (QVLFIWZX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src), - (QVLPCLDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src), - (QVLPCLSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src), - (QVLPCRDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src), - 
(QVLPCRSX xoaddr:$src)>; - -def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst), - (QVSTFDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), - (QVSTFSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), - (QVSTFCDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), - (QVSTFCDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), - (QVSTFCSXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), - (QVSTFCSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), - (QVSTFDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), - (QVSTFIWXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), - (QVSTFIWX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), - (QVSTFSXA $T, xoaddr:$dst)>; - -def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFDUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUXs $rS, $ptrreg, $ptroff)>; - -def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), - (QVFLOGICAL $A, $B, imm:$idx)>; -def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), - (QVGPCI imm:$idx)>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), - (QVFCMPEQb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), - (QVFCMPGTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), - (QVFCMPLTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 10))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : 
Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), - (QVFCMPEQbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), - (QVFCMPGTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), - (QVFCMPLTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 10))>; - -def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 4))>; -def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 8))>; -def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 9))>; -def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 13))>; -def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 14))>; - -def : Pat<(and v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 1))>; -def : Pat<(or v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 7))>; -def : Pat<(xor v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 6))>; -def : Pat<(not v4i1:$FRA), - (QVFLOGICALb $FRA, $FRA, (i32 10))>; - -def : Pat<(v4f64 (fpextend v4f32:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f32 (fround_exact v4f64:$src)), - (COPY_TO_REGCLASS $src, QSRC)>; - -// Extract the underlying floating-point values from the -// QPX (-1.0, 1.0) boolean representation. 
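// The comment above pins down the QPX boolean representation: each v4i1 lane
// is stored in a floating-point register as +/-1.0, and when read back any
// positive value means true while everything else, including NaN, means
// false. A small self-contained model of that convention (helper names are
// made up):

#include <cassert>
#include <cmath>

static double encodeLane(bool B) { return B ? 1.0 : -1.0; }
static bool decodeLane(double D) { return D > 0.0; } // NaN > 0.0 is false

int main() {
  assert(decodeLane(encodeLane(true)));
  assert(!decodeLane(encodeLane(false)));
  assert(!decodeLane(std::nan(""))); // NaN lanes decode as false
  return 0;
}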
-def : Pat<(v4f64 (PPCqbflt v4i1:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), - (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), - (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)), - (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), - (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), - (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, 
i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)), - (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -} // end HasQPX - -let Predicates = [HasQPX, NoNaNsFPMath] in { -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>; -} - -let Predicates = [HasQPX, NaNsFPMath] in { -// When either of these operands is NaN, we should return the other operand. -// QVFCMPLT/QVFCMPGT return false is either operand is NaN, which means we need -// to explicitly or with a NaN test on the second operand. -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -} diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td index 858eb0c9fe500..299b34ca82836 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td +++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td @@ -819,16 +819,6 @@ def SPESTWX : XForm_8<31, 151, (outs), (ins spe4rc:$rS, memrr:$dst), } // HasSPE -let Predicates = [HasSPE] in { -def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (SPELWZ iaddr:$src), SPERC)>; -def : Pat<(f64 (extloadf32 xaddr:$src)), - (COPY_TO_REGCLASS (SPELWZX xaddr:$src), SPERC)>; - -def : Pat<(f64 (fpextend f32:$src)), - (COPY_TO_REGCLASS $src, SPERC)>; -} - let Predicates = [HasSPE] in { def SELECT_CC_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst), (ins crrc:$cond, spe4rc:$T, spe4rc:$F, diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 9ba5058a6f812..272f8b1c0aac1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -362,7 +362,8 @@ let hasSideEffects = 0 in { } } // mayStore - let Uses = [RM], mayRaiseFPException = 1 in { + let mayRaiseFPException = 1 in { + let Uses = [RM] in { // Add/Mul Instructions let isCommutable = 1 in { def XSADDDP : XX3Form<60, 32, @@ -885,15 +886,61 @@ let hasSideEffects = 0 in { "xvcvuxwsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>; - // Rounding Instructions - def XSRDPI : XX2Form<60, 73, - (outs vsfrc:$XT), (ins vsfrc:$XB), - "xsrdpi $XT, $XB", IIC_VecFP, - [(set f64:$XT, (any_fround f64:$XB))]>; + // Rounding Instructions respecting current rounding mode def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpic $XT, $XB", IIC_VecFP, [(set f64:$XT, (any_fnearbyint f64:$XB))]>; + def XVRDPIC : XX2Form<60, 
235, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpic $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>; + def XVRSPIC : XX2Form<60, 171, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspic $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>; + // Max/Min Instructions + let isCommutable = 1 in { + def XSMAXDP : XX3Form<60, 160, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; + def XSMINDP : XX3Form<60, 168, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmindp $XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; + + def XVMAXDP : XX3Form<60, 224, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; + def XVMINDP : XX3Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmindp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; + + def XVMAXSP : XX3Form<60, 192, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; + def XVMINSP : XX3Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvminsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; + } // isCommutable + } // Uses = [RM] + + // Rounding Instructions with static direction. + def XSRDPI : XX2Form<60, 73, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpi $XT, $XB", IIC_VecFP, + [(set f64:$XT, (any_fround f64:$XB))]>; def XSRDPIM : XX2Form<60, 121, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpim $XT, $XB", IIC_VecFP, @@ -911,10 +958,6 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpi $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (any_fround v2f64:$XB))]>; - def XVRDPIC : XX2Form<60, 235, - (outs vsrc:$XT), (ins vsrc:$XB), - "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>; def XVRDPIM : XX2Form<60, 249, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpim $XT, $XB", IIC_VecFP, @@ -932,10 +975,6 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrspi $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (any_fround v4f32:$XB))]>; - def XVRSPIC : XX2Form<60, 171, - (outs vsrc:$XT), (ins vsrc:$XB), - "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>; def XVRSPIM : XX2Form<60, 185, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspim $XT, $XB", IIC_VecFP, @@ -948,43 +987,7 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrspiz $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (any_ftrunc v4f32:$XB))]>; - - // Max/Min Instructions - let isCommutable = 1 in { - def XSMAXDP : XX3Form<60, 160, - (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), - "xsmaxdp $XT, $XA, $XB", IIC_VecFP, - [(set vsfrc:$XT, - (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; - def XSMINDP : XX3Form<60, 168, - (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), - "xsmindp $XT, $XA, $XB", IIC_VecFP, - [(set vsfrc:$XT, - (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; - - def XVMAXDP : XX3Form<60, 224, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmaxdp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; - def XVMINDP : XX3Form<60, 232, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmindp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; - - def XVMAXSP : XX3Form<60, 192, - 
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmaxsp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; - def XVMINSP : XX3Form<60, 200, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvminsp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; - } // isCommutable - } // Uses = [RM], mayRaiseFPException + } // mayRaiseFPException // Logical Instructions let isCommutable = 1 in diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index a7546d2be5d83..0068ad7174ee7 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -606,6 +606,10 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, NewBasePtr = NewPHI; } + // Clear the rewriter cache, because values that are in the rewriter's cache + // can be deleted below, causing the AssertingVH in the cache to trigger. + SCEVE.clear(); + if (Instruction *IDel = dyn_cast(BasePtr)) BBChanged.insert(IDel->getParent()); BasePtr->replaceAllUsesWith(NewBasePtr); diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp index 2b0e604e0ccde..a61e1f83705e6 100644 --- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" @@ -64,8 +65,7 @@ class PPCLowerMASSVEntries : public ModulePass { /// Checks if the specified function name represents an entry in the MASSV /// library. bool PPCLowerMASSVEntries::isMASSVFunc(StringRef Name) { - auto Iter = std::find(std::begin(MASSVFuncs), std::end(MASSVFuncs), Name); - return Iter != std::end(MASSVFuncs); + return llvm::is_contained(MASSVFuncs, Name); } // FIXME: diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp deleted file mode 100644 index 6e90426438208..0000000000000 --- a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The QPX vector registers overlay the scalar floating-point registers, and -// any scalar floating-point loads splat their value across all vector lanes. -// Thus, if we have a scalar load followed by a splat, we can remove the splat -// (i.e. replace the load with a load-and-splat pseudo instruction). -// -// This pass must run after anything that might do store-to-load forwarding. 
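// The header above describes the whole transformation this deleted pass
// performed: scalar floating-point loads already splat their value across all
// QPX lanes, so an explicit splat fed only by such a load is redundant. A toy
// peephole capturing that rewrite (the instruction representation here is
// invented, not the MachineInstr API), run conceptually after any
// store-to-load forwarding:

#include <cstddef>
#include <string>
#include <vector>

struct Inst { std::string Op; };

// Collapse a scalar load immediately followed by a lane-0 splat of its
// result into one load-and-splat instruction.
static void foldLoadSplat(std::vector<Inst> &Block) {
  for (std::size_t I = 0; I + 1 < Block.size(); ++I)
    if (Block[I].Op == "load.f64" && Block[I + 1].Op == "splat.lane0") {
      Block[I].Op = "load.splat.f64";     // combined instruction remains
      Block.erase(Block.begin() + I + 1); // explicit splat goes away
    }
}

int main() {
  std::vector<Inst> B{{"load.f64"}, {"splat.lane0"}, {"add.v4f64"}};
  foldLoadSplat(B);
  return B.size() == 2 && B[0].Op == "load.splat.f64" ? 0 : 1;
}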
-// -//===----------------------------------------------------------------------===// - -#include "PPC.h" -#include "PPCInstrBuilder.h" -#include "PPCInstrInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "ppc-qpx-load-splat" - -STATISTIC(NumSimplified, "Number of QPX load splats simplified"); - -namespace { - struct PPCQPXLoadSplat : public MachineFunctionPass { - static char ID; - PPCQPXLoadSplat() : MachineFunctionPass(ID) { - initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { - return "PowerPC QPX Load Splat Simplification"; - } - }; - char PPCQPXLoadSplat::ID = 0; -} - -INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat", - "PowerPC QPX Load Splat Simplification", - false, false) - -FunctionPass *llvm::createPPCQPXLoadSplatPass() { - return new PPCQPXLoadSplat(); -} - -bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - bool MadeChange = false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - - for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) { - MachineBasicBlock *MBB = &*MFI; - SmallVector Splats; - - for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) { - MachineInstr *MI = &*MBBI; - - if (MI->hasUnmodeledSideEffects() || MI->isCall()) { - Splats.clear(); - continue; - } - - // We're looking for a sequence like this: - // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2) - // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm - - for (auto SI = Splats.begin(); SI != Splats.end();) { - MachineInstr *SMI = *SI; - Register SplatReg = SMI->getOperand(0).getReg(); - Register SrcReg = SMI->getOperand(1).getReg(); - - if (MI->modifiesRegister(SrcReg, TRI)) { - switch (MI->getOpcode()) { - default: - SI = Splats.erase(SI); - continue; - case PPC::LFS: - case PPC::LFD: - case PPC::LFSU: - case PPC::LFDU: - case PPC::LFSUX: - case PPC::LFDUX: - case PPC::LFSX: - case PPC::LFDX: - case PPC::LFIWAX: - case PPC::LFIWZX: - if (SplatReg != SrcReg) { - // We need to change the load to define the scalar subregister of - // the QPX splat source register. - unsigned SubRegIndex = - TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); - Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); - - // Substitute both the explicit defined register, and also the - // implicit def of the containing QPX register. - MI->getOperand(0).setReg(SplatSubReg); - MI->substituteRegister(SrcReg, SplatReg, 0, *TRI); - } - - SI = Splats.erase(SI); - - // If SMI is directly after MI, then MBBI's base iterator is - // pointing at SMI. Adjust MBBI around the call to erase SMI to - // avoid invalidating MBBI. - ++MBBI; - SMI->eraseFromParent(); - --MBBI; - - ++NumSimplified; - MadeChange = true; - continue; - } - } - - // If this instruction defines the splat register, then we cannot move - // the previous definition above it. If it reads from the splat - // register, then it must already be alive from some previous - // definition, and if the splat register is different from the source - // register, then this definition must not be the load for which we're - // searching. 
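// The candidate scan above threads an iterator through conditional erasures;
// the pairing of SI = Splats.erase(SI) with ++SI is what keeps the iterator
// valid. The same idiom in a self-contained form:

#include <cassert>
#include <vector>

// erase() returns an iterator to the next element, so exactly one of
// erase()/++ advances the cursor on each pass and no invalidated iterator
// is ever dereferenced.
static void pruneNonPositive(std::vector<int> &Candidates) {
  for (auto It = Candidates.begin(); It != Candidates.end();) {
    if (*It <= 0)
      It = Candidates.erase(It); // candidate dropped: erase advances
    else
      ++It;                      // candidate kept: advance manually
  }
}

int main() {
  std::vector<int> V{3, -1, 0, 7};
  pruneNonPositive(V);
  assert(V == (std::vector<int>{3, 7}));
  return 0;
}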
- if (MI->modifiesRegister(SplatReg, TRI) || - (SrcReg != SplatReg && - MI->readsRegister(SplatReg, TRI))) { - SI = Splats.erase(SI); - continue; - } - - ++SI; - } - - if (MI->getOpcode() != PPC::QVESPLATI && - MI->getOpcode() != PPC::QVESPLATIs && - MI->getOpcode() != PPC::QVESPLATIb) - continue; - if (MI->getOperand(2).getImm() != 0) - continue; - - // If there are other uses of the scalar value after this, replacing - // those uses might be non-trivial. - if (!MI->getOperand(1).isKill()) - continue; - - Splats.push_back(MI); - } - } - - return MadeChange; -} diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index ed8948a639728..96666ad58dfe5 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -404,9 +404,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } case PPC::F8RCRegClassID: case PPC::F4RCRegClassID: - case PPC::QFRCRegClassID: - case PPC::QSRCRegClassID: - case PPC::QBRCRegClassID: case PPC::VRRCRegClassID: case PPC::VFRCRegClassID: case PPC::VSLRCRegClassID: diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 61acd955e1cba..a931967862c7b 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -153,7 +153,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { switch (RegName[0]) { case 'r': case 'f': - case 'q': // for QPX case 'v': if (RegName[1] == 's') return RegName + 2; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index b45757c1acc5e..e07b960ae305b 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -54,13 +54,6 @@ class FPR num, string n> : PPCReg { let HWEncoding{4-0} = num; } -// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX) -class QFPR : PPCReg { - let HWEncoding = SubReg.HWEncoding; - let SubRegs = [SubReg]; - let SubRegIndices = [sub_64]; -} - // VF - One of the 32 64-bit floating-point subregisters of the vector // registers (used by VSX). class VF num, string n> : PPCReg { @@ -132,12 +125,6 @@ foreach Index = 0-31 in { DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; } -// QPX Floating-point registers -foreach Index = 0-31 in { - def QF#Index : QFPR("F"#Index), "q"#Index>, - DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; -} - // Vector registers foreach Index = 0-31 in { def V#Index : VR("VF"#Index), "v"#Index>, @@ -343,16 +330,6 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; -// For QPX -def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13), - (sequence "QF%u", 31, 14))>; -def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>; -def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> { - // These are actually stored as floating-point values where a positive - // number is true and anything else (including NaN) is false. 
- let Size = 256; -} - def CRBITRC : RegisterClass<"PPC", [i1], 32, (add CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT, CR3EQ, CR3UN, diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 0a1ae7e55b3c2..c79d55f56b2a6 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -40,12 +40,11 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; - // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing - // Engine), prefixed instructions on Power 9, PC relative mem ops, or - // instructions introduced in ISA 3.1. - let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops, - IsISA3_1]; - + // Do not support SPE (Signal Processing Engine), prefixed instructions on + // Power 9, paired vector mem ops, PC relative mem ops, or instructions + // introduced in ISA 3.1. + let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, + PCRelativeMemops, IsISA3_1]; } let SchedModel = P9Model in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 3836cc960394f..6577b37f908f2 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -35,10 +35,6 @@ using namespace llvm; static cl::opt UseSubRegLiveness("ppc-track-subreg-liveness", cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); -static cl::opt QPXStackUnaligned("qpx-stack-unaligned", - cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), - cl::Hidden); - static cl::opt EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), @@ -70,7 +66,6 @@ void PPCSubtarget::initializeEnvironment() { HasAltivec = false; HasSPE = false; HasFPU = false; - HasQPX = false; HasVSX = false; NeedsTwoConstNR = false; HasP8Vector = false; @@ -109,7 +104,6 @@ void PPCSubtarget::initializeEnvironment() { HasInvariantFunctionDescriptors = false; HasPartwordAtomics = false; HasDirectMove = false; - IsQPXStackUnaligned = false; HasHTM = false; HasFloat128 = false; HasFusion = false; @@ -122,6 +116,7 @@ void PPCSubtarget::initializeEnvironment() { VectorsUseTwoUnits = false; UsePPCPreRASchedStrategy = false; UsePPCPostRASchedStrategy = false; + PairedVectorMemops = false; PredictableSelectIsExpensive = false; HasPOPCNTD = POPCNTD_Unavailable; @@ -158,7 +153,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); - if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) + if (HasSPE && (HasAltivec || HasVSX || HasFPU)) report_fatal_error( "SPE and traditional floating point cannot both be enabled.\n", false); @@ -166,10 +161,6 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!HasSPE) HasFPU = true; - // QPX requires a 32-byte aligned stack. Note that we need to do this if - // we're compiling for a BG/Q system regardless of whether or not QPX - // is enabled because external functions will assume this alignment. - IsQPXStackUnaligned = QPXStackUnaligned; StackAlignment = getPlatformStackAlignment(); // Determine endianness. 
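// The feature validation kept by this change still rejects SPE combined with
// any traditional floating-point unit; only the HasQPX term drops out of the
// condition. A hedged sketch with illustrative names (this is not the real
// PPCSubtarget interface):

#include <stdexcept>

struct FeatureFlags {
  bool HasSPE = false, HasAltivec = false, HasVSX = false, HasFPU = false;
  bool IsPPC64 = false;
};

static void checkFeatureFlags(const FeatureFlags &F) {
  if (F.HasSPE && F.IsPPC64)
    throw std::runtime_error("SPE is only supported for 32-bit targets.");
  if (F.HasSPE && (F.HasAltivec || F.HasVSX || F.HasFPU))
    throw std::runtime_error(
        "SPE and traditional floating point cannot both be enabled.");
}

int main() {
  FeatureFlags F;
  F.HasSPE = F.HasVSX = true;
  try {
    checkFeatureFlags(F);
    return 1; // should have been rejected
  } catch (const std::exception &) {
    return 0;
  }
}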
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index ec329022c4572..a457214e86302 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -97,7 +97,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool HasAltivec; bool HasFPU; bool HasSPE; - bool HasQPX; bool HasVSX; bool NeedsTwoConstNR; bool HasP8Vector; @@ -146,15 +145,11 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool VectorsUseTwoUnits; bool UsePPCPreRASchedStrategy; bool UsePPCPostRASchedStrategy; + bool PairedVectorMemops; bool PredictableSelectIsExpensive; POPCNTDKind HasPOPCNTD; - /// When targeting QPX running a stock PPC64 Linux kernel where the stack - /// alignment has not been changed, we need to keep the 16-byte alignment - /// of the stack. - bool IsQPXStackUnaligned; - const PPCTargetMachine &TM; PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; @@ -255,7 +250,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasAltivec() const { return HasAltivec; } bool hasSPE() const { return HasSPE; } bool hasFPU() const { return HasFPU; } - bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } @@ -266,6 +260,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasP10Vector() const { return HasP10Vector; } bool hasPrefixInstrs() const { return HasPrefixInstrs; } bool hasPCRelativeMemops() const { return HasPCRelativeMemops; } + bool pairedVectorMemops() const { return PairedVectorMemops; } bool hasMFOCRF() const { return HasMFOCRF; } bool hasISEL() const { return HasISEL; } bool hasBPERMD() const { return HasBPERMD; } @@ -291,11 +286,7 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } - bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } Align getPlatformStackAlignment() const { - if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return Align(32); - return Align(16); } @@ -325,9 +316,6 @@ class PPCSubtarget : public PPCGenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } - /// isBGQ - True if this is a BG/Q platform. 
- bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index f15f9c7f49429..27de5b29cd341 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -63,10 +63,6 @@ static cl:: opt DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); -static cl:: -opt DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden, - cl::desc("Disable QPX load splat simplification")); - static cl:: opt DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); @@ -114,7 +110,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCReduceCRLogicalsPass(PR); initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); - initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); @@ -411,14 +406,9 @@ void PPCPassConfig::addIRPasses() { // Lower generic MASSV routines to PowerPC subtarget-specific entries. addPass(createPPCLowerMASSVEntriesPass()); - - // For the BG/Q (or if explicitly requested), add explicit data prefetch - // intrinsics. - bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && - getOptLevel() != CodeGenOpt::None; + + // If explicitly requested, add explicit data prefetch intrinsics. if (EnablePrefetch.getNumOccurrences() > 0) - UsePrefetching = EnablePrefetch; - if (UsePrefetching) addPass(createLoopDataPrefetchPass()); if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { @@ -515,15 +505,8 @@ void PPCPassConfig::addPreRegAlloc() { } void PPCPassConfig::addPreSched2() { - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); - - // This optimization must happen after anything that might do store-to-load - // forwarding. Here we're after RA (and, thus, when spills are inserted) - // but before post-RA scheduling. - if (!DisableQPXLoadSplat) - addPass(createPPCQPXLoadSplatPass()); - } } void PPCPassConfig::addPreEmitPass() { diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/llvm/lib/Target/PowerPC/PPCTargetMachine.h index fd1d14ae32d4a..21faa4e710e3e 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.h +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -58,6 +58,11 @@ class PPCTargetMachine final : public LLVMTargetMachine { const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. 
+ return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index bbb4239d36da5..8434fdebe1b07 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -8,6 +8,7 @@ #include "PPCTargetTransformInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" @@ -18,6 +19,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "ppctti" @@ -25,8 +27,7 @@ using namespace llvm; static cl::opt DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); -// This is currently only used for the data prefetch pass which is only enabled -// for BG/Q by default. +// This is currently only used for the data prefetch pass static cl::opt CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); @@ -104,55 +105,6 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); } - case Intrinsic::ppc_qpx_qvlfs: - // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = - VectorType::get(IC.Builder.getFloatTy(), - cast(II.getType())->getElementCount()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), - PointerType::getUnqual(VTy)); - Value *Load = IC.Builder.CreateLoad(VTy, Ptr); - return new FPExtInst(Load, II.getType()); - } - break; - case Intrinsic::ppc_qpx_qvlfd: - // Turn PPC QPX qvlfd -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Value *Ptr = IC.Builder.CreateBitCast( - II.getArgOperand(0), PointerType::getUnqual(II.getType())); - return new LoadInst(II.getType(), Ptr, "", false, Align(32)); - } - break; - case Intrinsic::ppc_qpx_qvstfs: - // Turn PPC QPX qvstfs -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { - Type *VTy = VectorType::get( - IC.Builder.getFloatTy(), - cast(II.getArgOperand(0)->getType())->getElementCount()); - Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); - Type *OpPtrTy = PointerType::getUnqual(VTy); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(TOp, Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_qpx_qvstfd: - // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 
- if (getOrEnforceKnownAlignment( - II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { - Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); - Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); - return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); - } - break; - case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -161,7 +113,7 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // the permutation mask with respect to 31 and reverse the order of // V1 and V2. if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) { - assert(cast<VectorType>(Mask->getType())->getNumElements() == 16 && + assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 && "Bad type for intrinsic!"); // Check that all of the elements are integer constants or undefs. @@ -736,10 +688,7 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) { } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { - // On the A2, always unroll aggressively. For QPX unaligned loads, we depend - // on combining the loads generated for consecutive accesses, and failure to - // do so is particularly expensive. This makes it much more likely (compared - // to only using concatenation unrolling). + // On the A2, always unroll aggressively. if (ST->getCPUDirective() == PPC::DIR_A2) return true; @@ -799,7 +748,6 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { - if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; return 0; } @@ -828,8 +776,6 @@ unsigned PPCTTIImpl::getCacheLineSize() const { } unsigned PPCTTIImpl::getPrefetchDistance() const { - // This seems like a reasonable default for the BG/Q (this pass is enabled, by - // default, only on the BG/Q). return 300; } @@ -918,7 +864,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the @@ -935,11 +881,12 @@ int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { } int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); - int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I); + int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src); // TODO: Allow non-throughput costs that aren't binary. if (CostKind != TTI::TCK_RecipThroughput) @@ -974,13 +921,6 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; - } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { - // Floating point scalars are already located in index #0.
- if (Index == 0) - return 0; - - return Cost; - - } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { if (ST->hasP9Altivec()) { if (ISD == ISD::INSERT_VECTOR_ELT) return Cost; @@ -1055,8 +995,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, LT.second == MVT::v4i32 || LT.second == MVT::v4f32); bool IsVSXType = ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); - bool IsQPXType = ST->hasQPX() && - (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and @@ -1079,8 +1017,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // for Altivec types using the VSX instructions, but that's more expensive // than using the permutation-based load sequence. On the P8, that's no // longer true. - if (Opcode == Instruction::Load && - ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) && *Alignment >= LT.second.getScalarType().getStoreSize()) return Cost + LT.first; // Add the cost of the permutations. @@ -1133,7 +1070,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost( getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec and VSX, supports cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). For each result vector, we need one shuffle per incoming // vector (except that the first shuffle can take two incoming vectors diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index a3453b00a864a..d9aab298c7c16 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -106,7 +106,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { const Instruction *CxtI = nullptr); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/README.txt b/llvm/lib/Target/README.txt index 563aee9e1a780..a4876f715c64b 100644 --- a/llvm/lib/Target/README.txt +++ b/llvm/lib/Target/README.txt @@ -1541,9 +1541,9 @@ int bar() { return foo("abcd"); } //===---------------------------------------------------------------------===// -functionattrs doesn't know much about memcpy/memset. 
This function should be marked readnone rather than readonly, since it only twiddles local memory, but -functionattrs doesn't handle memset/memcpy/memmove aggressively: +function-attrs doesn't handle memset/memcpy/memmove aggressively: struct X { int *p; int *q; }; int foo() { @@ -1557,7 +1557,7 @@ int foo() { } This can be seen at: -$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -functionattrs -S +$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -function-attrs -S //===---------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 407f980bd35e8..c094163c2f9d7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -288,11 +288,21 @@ struct RISCVOperand : public MCParsedAsmOperand { SEW_1024, }; - enum class VLMUL { LMUL_1 = 0, LMUL_2, LMUL_4, LMUL_8 }; + enum class VLMUL { + LMUL_1 = 0, + LMUL_2, + LMUL_4, + LMUL_8, + LMUL_F8 = 5, + LMUL_F4, + LMUL_F2 + }; struct VTypeOp { VSEW Sew; VLMUL Lmul; + bool TailAgnostic; + bool MaskedoffAgnostic; unsigned Encoding; }; @@ -763,7 +773,7 @@ struct RISCVOperand : public MCParsedAsmOperand { case VSEW::SEW_1024: return "e1024"; } - return ""; + llvm_unreachable("Unknown SEW."); } static StringRef getLMULStr(VLMUL Lmul) { @@ -776,8 +786,14 @@ struct RISCVOperand : public MCParsedAsmOperand { return "m4"; case VLMUL::LMUL_8: return "m8"; + case VLMUL::LMUL_F2: + return "mf2"; + case VLMUL::LMUL_F4: + return "mf4"; + case VLMUL::LMUL_F8: + return "mf8"; } - return ""; + llvm_unreachable("Unknown LMUL."); } StringRef getVType(SmallString<32> &Buf) const { @@ -852,15 +868,31 @@ struct RISCVOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createVType(APInt Sew, APInt Lmul, - SMLoc S, bool IsRV64) { + static std::unique_ptr + createVType(APInt Sew, APInt Lmul, bool Fractional, bool TailAgnostic, + bool MaskedoffAgnostic, SMLoc S, bool IsRV64) { auto Op = std::make_unique(KindTy::VType); Sew.ashrInPlace(3); unsigned SewLog2 = Sew.logBase2(); unsigned LmulLog2 = Lmul.logBase2(); Op->VType.Sew = static_cast(SewLog2); - Op->VType.Lmul = static_cast(LmulLog2); - Op->VType.Encoding = (SewLog2 << 2) | LmulLog2; + if (Fractional) { + unsigned Flmul = 8 - LmulLog2; + Op->VType.Lmul = static_cast(Flmul); + Op->VType.Encoding = + ((Flmul & 0x4) << 3) | ((SewLog2 & 0x7) << 2) | (Flmul & 0x3); + } else { + Op->VType.Lmul = static_cast(LmulLog2); + Op->VType.Encoding = (SewLog2 << 2) | LmulLog2; + } + if (TailAgnostic) { + Op->VType.Encoding |= 0x40; + } + if (MaskedoffAgnostic) { + Op->VType.Encoding |= 0x80; + } + Op->VType.TailAgnostic = TailAgnostic; + Op->VType.MaskedoffAgnostic = MaskedoffAgnostic; Op->StartLoc = S; Op->IsRV64 = IsRV64; return Op; @@ -1181,8 +1213,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, - "operand must be e[8|16|32|64|128|256|512|1024],m[1|2|4|8]"); + return Error( + ErrorLoc, + "operand must be " + "e[8|16|32|64|128|256|512|1024],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]"); } case Match_InvalidVMaskRegister: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); @@ -1549,7 +1583,7 @@ OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) { if (getLexer().getKind() != AsmToken::Identifier) return MatchOperand_NoMatch; - // 
Parse "e8,m1" + // Parse "e8,m1,t[a|u],m[a|u]" StringRef Name = getLexer().getTok().getIdentifier(); if (!Name.consume_front("e")) return MatchOperand_NoMatch; @@ -1559,13 +1593,6 @@ OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) { return MatchOperand_NoMatch; getLexer().Lex(); - if (getLexer().getKind() == AsmToken::EndOfStatement) { - Operands.push_back( - RISCVOperand::createVType(Sew, APInt(16, 1), S, isRV64())); - - return MatchOperand_Success; - } - if (!getLexer().is(AsmToken::Comma)) return MatchOperand_NoMatch; getLexer().Lex(); @@ -1573,15 +1600,51 @@ OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) { Name = getLexer().getTok().getIdentifier(); if (!Name.consume_front("m")) return MatchOperand_NoMatch; + // "m" or "mf" + bool Fractional = false; + if (Name.consume_front("f")) { + Fractional = true; + } APInt Lmul(16, Name, 10); if (Lmul != 1 && Lmul != 2 && Lmul != 4 && Lmul != 8) return MatchOperand_NoMatch; getLexer().Lex(); + if (!getLexer().is(AsmToken::Comma)) + return MatchOperand_NoMatch; + getLexer().Lex(); + + Name = getLexer().getTok().getIdentifier(); + // ta or tu + bool TailAgnostic; + if (Name.consume_front("ta")) + TailAgnostic = true; + else if (Name.consume_front("tu")) + TailAgnostic = false; + else + return MatchOperand_NoMatch; + getLexer().Lex(); + + if (!getLexer().is(AsmToken::Comma)) + return MatchOperand_NoMatch; + getLexer().Lex(); + + Name = getLexer().getTok().getIdentifier(); + // ma or mu + bool MaskedoffAgnostic; + if (Name.consume_front("ma")) + MaskedoffAgnostic = true; + else if (Name.consume_front("mu")) + MaskedoffAgnostic = false; + else + return MatchOperand_NoMatch; + getLexer().Lex(); + if (getLexer().getKind() != AsmToken::EndOfStatement) return MatchOperand_NoMatch; - Operands.push_back(RISCVOperand::createVType(Sew, Lmul, S, isRV64())); + Operands.push_back(RISCVOperand::createVType( + Sew, Lmul, Fractional, TailAgnostic, MaskedoffAgnostic, S, isRV64())); return MatchOperand_Success; } @@ -2281,71 +2344,41 @@ bool RISCVAsmParser::validateInstruction(MCInst &Inst, return false; unsigned DestReg = Inst.getOperand(0).getReg(); + unsigned CheckReg; // Operands[1] will be the first operand, DestReg. SMLoc Loc = Operands[1]->getStartLoc(); - if ((TargetFlags == RISCV::WidenV) || (TargetFlags == RISCV::WidenW) || - (TargetFlags == RISCV::SlideUp) || (TargetFlags == RISCV::Vrgather) || - (TargetFlags == RISCV::Vcompress)) { - if (TargetFlags != RISCV::WidenW) { - unsigned Src2Reg = Inst.getOperand(1).getReg(); - if (DestReg == Src2Reg) - return Error(Loc, "The destination vector register group cannot overlap" - " the source vector register group."); - if (TargetFlags == RISCV::WidenV) { - // Assume DestReg LMUL is 2 at least for widening/narrowing operations. - if (DestReg + 1 == Src2Reg) - return Error(Loc, - "The destination vector register group cannot overlap" - " the source vector register group."); - } - } - if (Inst.getOperand(2).isReg()) { - unsigned Src1Reg = Inst.getOperand(2).getReg(); - if (DestReg == Src1Reg) - return Error(Loc, "The destination vector register group cannot overlap" - " the source vector register group."); - if (TargetFlags == RISCV::WidenV || TargetFlags == RISCV::WidenW) { - // Assume DestReg LMUL is 2 at least for widening/narrowing operations. 
- if (DestReg + 1 == Src1Reg) - return Error(Loc, - "The destination vector register group cannot overlap" - " the source vector register group."); - } - } - if (Inst.getNumOperands() == 4) { - unsigned MaskReg = Inst.getOperand(3).getReg(); - - if (DestReg == MaskReg) - return Error(Loc, "The destination vector register group cannot overlap" - " the mask register."); - } - } else if (TargetFlags == RISCV::Narrow) { - unsigned Src2Reg = Inst.getOperand(1).getReg(); - if (DestReg == Src2Reg) + if (TargetFlags & RISCV::VS2Constraint) { + CheckReg = Inst.getOperand(1).getReg(); + if (DestReg == CheckReg) return Error(Loc, "The destination vector register group cannot overlap" " the source vector register group."); - // Assume Src2Reg LMUL is 2 at least for widening/narrowing operations. - if (DestReg == Src2Reg + 1) + } + if ((TargetFlags & RISCV::VS1Constraint) && (Inst.getOperand(2).isReg())) { + CheckReg = Inst.getOperand(2).getReg(); + if (DestReg == CheckReg) return Error(Loc, "The destination vector register group cannot overlap" " the source vector register group."); - } else if (TargetFlags == RISCV::WidenCvt || TargetFlags == RISCV::Iota) { - unsigned Src2Reg = Inst.getOperand(1).getReg(); - if (DestReg == Src2Reg) + } + if ((TargetFlags & RISCV::VMConstraint) && (DestReg == RISCV::V0)) { + // vadc and vsbc are special cases. These instructions have no mask + // register, and the destination register cannot be V0. + unsigned Opcode = Inst.getOpcode(); + if (Opcode == RISCV::VADC_VVM || Opcode == RISCV::VADC_VXM || + Opcode == RISCV::VADC_VIM || Opcode == RISCV::VSBC_VVM || + Opcode == RISCV::VSBC_VXM) + return Error(Loc, "The destination vector register group cannot be V0."); + + // Regardless of whether it is the masked or unmasked version, the number + // of operands is the same. For example, "viota.m v0, v2" is actually + // "viota.m v0, v2, NoRegister". We need to check the last operand to + // determine whether the instruction is masked. + if ((TargetFlags & RISCV::OneInput) && (Inst.getNumOperands() == 3)) + CheckReg = Inst.getOperand(2).getReg(); + else if (Inst.getNumOperands() == 4) + CheckReg = Inst.getOperand(3).getReg(); + if (DestReg == CheckReg) return Error(Loc, "The destination vector register group cannot overlap - " the source vector register group."); - if (TargetFlags == RISCV::WidenCvt) { - // Assume DestReg LMUL is 2 at least for widening/narrowing operations. 
- if (DestReg + 1 == Src2Reg) - return Error(Loc, "The destination vector register group cannot overlap" - " the source vector register group."); - } - if (Inst.getNumOperands() == 3) { - unsigned MaskReg = Inst.getOperand(2).getReg(); - - if (DestReg == MaskReg) - return Error(Loc, "The destination vector register group cannot overlap" - " the mask register."); - } + " the mask register."); } return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index eae3e13dbe409..2c7038a0a8ace 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -155,10 +155,28 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); unsigned Sew = (Imm >> 2) & 0x7; unsigned Lmul = Imm & 0x3; + bool Fractional = (Imm >> 5) & 0x1; - Lmul = 0x1 << Lmul; Sew = 0x1 << (Sew + 3); - O << "e" << Sew << ",m" << Lmul; + O << "e" << Sew; + if (Fractional) { + Lmul = 4 - Lmul; + Lmul = 0x1 << Lmul; + O << ",mf" << Lmul; + } else { + Lmul = 0x1 << Lmul; + O << ",m" << Lmul; + } + bool TailAgnostic = Imm & 0x40; + bool MaskedoffAgnostic = Imm & 0x80; + if (TailAgnostic) + O << ",ta"; + else + O << ",tu"; + if (MaskedoffAgnostic) + O << ",ma"; + else + O << ",mu"; } void RISCVInstPrinter::printVMaskReg(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cad5f8e21185f..773a098690330 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1003,7 +1003,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && Subtarget.hasStdExtF() && "Unexpected custom legalisation"); - SDLoc DL(N); SDValue Op0 = N->getOperand(0); if (Op0.getValueType() != MVT::f32) return; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index a47945a6a5154..21f0514146353 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -52,15 +52,37 @@ def InstFormatOther : InstFormat<17>; class RISCVVConstraint val> { bits<4> Value = val; } -def NoConstraint : RISCVVConstraint<0>; -def WidenV : RISCVVConstraint<1>; -def WidenW : RISCVVConstraint<2>; -def WidenCvt : RISCVVConstraint<3>; -def Narrow : RISCVVConstraint<4>; -def Iota : RISCVVConstraint<5>; -def SlideUp : RISCVVConstraint<6>; -def Vrgather : RISCVVConstraint<7>; -def Vcompress : RISCVVConstraint<8>; +def NoConstraint : RISCVVConstraint<0b0000>; +def VS2Constraint : RISCVVConstraint<0b0001>; +def VS1Constraint : RISCVVConstraint<0b0010>; +def VMConstraint : RISCVVConstraint<0b0100>; +def OneInput : RISCVVConstraint<0b1000>; + +def WidenV : RISCVVConstraint; +def WidenW : RISCVVConstraint; +def WidenCvt : RISCVVConstraint; +def Narrow : RISCVVConstraint; +def NarrowCvt : RISCVVConstraint; +def Vmadc : RISCVVConstraint; +def Iota : RISCVVConstraint; +def SlideUp : RISCVVConstraint; +def Vrgather : RISCVVConstraint; +def Vcompress : RISCVVConstraint; // The following opcode names match those given in Table 19.1 in the // RISC-V User-level ISA specification ("RISC-V base opcode map"). 
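Since the v0.9 vtype changes are split between the assembler (createVType in RISCVAsmParser.cpp) and the printer (printVTypeI in RISCVInstPrinter.cpp) above, a compact sketch of the immediate layout may help. This is an illustrative helper with hypothetical names, not an LLVM API; the bit positions mirror the code in the diff:

```cpp
// Illustrative encoder for the v0.9 vtype immediate, mirroring createVType:
//   bits 1:0  low bits of LMUL        bits 4:2  log2(SEW/8)
//   bit 5     high LMUL bit (set => fractional LMUL)
//   bit 6     ta (tail agnostic)      bit 7     ma (mask agnostic)
#include <cassert>

unsigned encodeVType(unsigned Sew, unsigned Lmul, bool Fractional,
                     bool TailAgnostic, bool MaskAgnostic) {
  unsigned SewLog2 = __builtin_ctz(Sew / 8);  // e8 -> 0, e16 -> 1, e32 -> 2 ...
  unsigned LmulLog2 = __builtin_ctz(Lmul);    // m1/mf1 -> 0, m2/mf2 -> 1 ...
  unsigned Enc;
  if (Fractional) {
    unsigned Flmul = 8 - LmulLog2;            // mf2 -> 7, mf4 -> 6, mf8 -> 5
    Enc = ((Flmul & 0x4) << 3) | ((SewLog2 & 0x7) << 2) | (Flmul & 0x3);
  } else {
    Enc = (SewLog2 << 2) | LmulLog2;
  }
  if (TailAgnostic)
    Enc |= 0x40;
  if (MaskAgnostic)
    Enc |= 0x80;
  return Enc;
}

int main() {
  assert(encodeVType(8, 1, false, false, false) == 0x00); // e8,m1,tu,mu
  assert(encodeVType(32, 2, true, true, false) == 0x6b);  // e32,mf2,ta,mu
  return 0;
}
```

The printer performs the inverse: when bit 5 is set, printVTypeI above recovers the fractional LMUL denominator as 1 << (4 - (Imm & 0x3)), so 0x6b prints back as "e32,mf2,ta,mu".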
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td index e5f154966ba61..8ca010d033c39 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td @@ -21,20 +21,17 @@ def OPIVX : RISCVVFormat<0b100>; def OPFVF : RISCVVFormat<0b101>; def OPMVX : RISCVVFormat<0b110>; -class RISCVMOP val> { - bits<3> Value = val; +class RISCVMOP val> { + bits<2> Value = val; } -def MOPLDUnitStrideU : RISCVMOP<0b000>; -def MOPLDStridedU : RISCVMOP<0b010>; -def MOPLDIndexedU : RISCVMOP<0b011>; -def MOPLDUnitStrideS : RISCVMOP<0b100>; -def MOPLDStridedS : RISCVMOP<0b110>; -def MOPLDIndexedS : RISCVMOP<0b111>; - -def MOPSTUnitStride : RISCVMOP<0b000>; -def MOPSTStrided : RISCVMOP<0b010>; -def MOPSTIndexedOrder: RISCVMOP<0b011>; -def MOPSTIndexedUnOrd: RISCVMOP<0b111>; +def MOPLDUnitStride : RISCVMOP<0b00>; +def MOPLDStrided : RISCVMOP<0b10>; +def MOPLDIndexed : RISCVMOP<0b11>; + +def MOPSTUnitStride : RISCVMOP<0b00>; +def MOPSTIndexedUnord : RISCVMOP<0b01>; +def MOPSTStrided : RISCVMOP<0b10>; +def MOPSTIndexedOrder : RISCVMOP<0b11>; class RISCVLSUMOP val> { bits<5> Value = val; @@ -45,13 +42,17 @@ def LUMOPUnitStrideFF: RISCVLSUMOP<0b10000>; def SUMOPUnitStride : RISCVLSUMOP<0b00000>; def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>; -class RISCVWidth val> { - bits<3> Value = val; +class RISCVWidth val> { + bits<4> Value = val; } -def LSWidthVByte : RISCVWidth<0b000>; -def LSWidthVHalf : RISCVWidth<0b101>; -def LSWidthVWord : RISCVWidth<0b110>; -def LSWidthVSEW : RISCVWidth<0b111>; +def LSWidth8 : RISCVWidth<0b0000>; +def LSWidth16 : RISCVWidth<0b0101>; +def LSWidth32 : RISCVWidth<0b0110>; +def LSWidth64 : RISCVWidth<0b0111>; +def LSWidth128 : RISCVWidth<0b1000>; +def LSWidth256 : RISCVWidth<0b1101>; +def LSWidth512 : RISCVWidth<0b1110>; +def LSWidth1024 : RISCVWidth<0b1111>; class RVInstSetVLi : RVInst { @@ -103,6 +104,7 @@ class RVInstVV funct6, RISCVVFormat opv, dag outs, dag ins, let Opcode = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } class RVInstVX funct6, RISCVVFormat opv, dag outs, dag ins, @@ -122,6 +124,7 @@ class RVInstVX funct6, RISCVVFormat opv, dag outs, dag ins, let Opcode = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } class RVInstV2 funct6, bits<5> vs2, RISCVVFormat opv, dag outs, dag ins, @@ -140,6 +143,7 @@ class RVInstV2 funct6, bits<5> vs2, RISCVVFormat opv, dag outs, dag ins, let Opcode = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } class RVInstIVI funct6, dag outs, dag ins, string opcodestr, @@ -159,6 +163,7 @@ class RVInstIVI funct6, dag outs, dag ins, string opcodestr, let Opcode = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } class RVInstV funct6, bits<5> vs1, RISCVVFormat opv, dag outs, @@ -177,10 +182,11 @@ class RVInstV funct6, bits<5> vs1, RISCVVFormat opv, dag outs, let Opcode = OPC_OP_V.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } -class RVInstVLU nf, RISCVMOP mop, RISCVLSUMOP lumop, - RISCVWidth width, dag outs, dag ins, string opcodestr, +class RVInstVLU nf, bit mew, RISCVLSUMOP lumop, + bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> rs1; @@ -188,18 +194,20 @@ class RVInstVLU nf, RISCVMOP mop, RISCVLSUMOP lumop, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = MOPLDUnitStride.Value; let Inst{25} = vm; let Inst{24-20} = lumop.Value; let 
Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vd; let Opcode = OPC_LOAD_FP.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } -class RVInstVLS nf, RISCVMOP mop, RISCVWidth width, +class RVInstVLS nf, bit mew, bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> rs2; @@ -208,18 +216,20 @@ class RVInstVLS nf, RISCVMOP mop, RISCVWidth width, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = MOPLDStrided.Value; let Inst{25} = vm; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vd; let Opcode = OPC_LOAD_FP.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } -class RVInstVLX nf, RISCVMOP mop, RISCVWidth width, +class RVInstVLX nf, bit mew, bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> vs2; @@ -228,19 +238,21 @@ class RVInstVLX nf, RISCVMOP mop, RISCVWidth width, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = MOPLDIndexed.Value; let Inst{25} = vm; let Inst{24-20} = vs2; let Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vd; let Opcode = OPC_LOAD_FP.Value; let Uses = [VTYPE, VL]; + let RVVConstraint = VMConstraint; } -class RVInstVSU nf, RISCVMOP mop, RISCVLSUMOP sumop, - RISCVWidth width, dag outs, dag ins, string opcodestr, +class RVInstVSU nf, bit mew, RISCVLSUMOP sumop, + bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> rs1; @@ -248,18 +260,19 @@ class RVInstVSU nf, RISCVMOP mop, RISCVLSUMOP sumop, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = MOPSTUnitStride.Value; let Inst{25} = vm; let Inst{24-20} = sumop.Value; let Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vs3; let Opcode = OPC_STORE_FP.Value; let Uses = [VTYPE, VL]; } -class RVInstVSS nf, RISCVMOP mop, RISCVWidth width, +class RVInstVSS nf, bit mew, bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> rs2; @@ -268,18 +281,19 @@ class RVInstVSS nf, RISCVMOP mop, RISCVWidth width, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = MOPSTStrided.Value; let Inst{25} = vm; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vs3; let Opcode = OPC_STORE_FP.Value; let Uses = [VTYPE, VL]; } -class RVInstVSX nf, RISCVMOP mop, RISCVWidth width, +class RVInstVSX nf, bit mew, RISCVMOP mop, bits<3> width, dag outs, dag ins, string opcodestr, string argstr> : RVInst { bits<5> vs2; @@ -288,11 +302,12 @@ class RVInstVSX nf, RISCVMOP mop, RISCVWidth width, bit vm; let Inst{31-29} = nf; - let Inst{28-26} = mop.Value; + let Inst{28} = mew; + let Inst{27-26} = mop.Value; let Inst{25} = vm; let Inst{24-20} = vs2; let Inst{19-15} = rs1; - let Inst{14-12} = width.Value; + let Inst{14-12} = width; let Inst{11-7} = vs3; let Opcode = OPC_STORE_FP.Value; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 21bc508cdc9ca..cd8b6d5fba59c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -138,14 +138,53 @@ namespace RISCV { // Match with the definitions in RISCVInstrFormatsV.td enum 
RVVConstraintType { NoConstraint = 0, - WidenV = 1, - WidenW = 2, - WidenCvt = 3, - Narrow = 4, - Iota = 5, - SlideUp = 6, - Vrgather = 7, - Vcompress = 8, + VS2Constraint = 0b0001, + VS1Constraint = 0b0010, + VMConstraint = 0b0100, + OneInput = 0b1000, + + // Illegal instructions: + // + // * The destination vector register group for a masked vector instruction + // cannot overlap the source mask register (v0), unless the destination vector + // register is being written with a mask value (e.g., comparisons) or the + // scalar result of a reduction. + // + // * Widening: The destination vector register group cannot overlap a source + // vector register group of a different EEW + // + // * Narrowing: The destination vector register group cannot overlap the + // first source vector register group + // + // * For vadc and vsbc, an illegal instruction exception is raised if the + // destination vector register is v0. + // + // * For vmadc and vmsbc, an illegal instruction exception is raised if the + // destination vector register overlaps a source vector register group. + // + // * viota: An illegal instruction exception is raised if the destination + // vector register group overlaps the source vector mask register. If the + // instruction is masked, an illegal instruction exception is issued if the + // destination vector register group overlaps v0. + // + // * v[f]slide[1]up: The destination vector register group for vslideup cannot + // overlap the source vector register group. + // + // * vrgather: The destination vector register group cannot overlap with the + // source vector register groups. + // + // * vcompress: The destination vector register group cannot overlap the + // source vector register group or the source mask register + WidenV = VS2Constraint | VS1Constraint | VMConstraint, + WidenW = VS1Constraint | VMConstraint, + WidenCvt = VS2Constraint | VMConstraint | OneInput, + Narrow = VS2Constraint | VMConstraint, + NarrowCvt = VS2Constraint | VMConstraint | OneInput, + Vmadc = VS2Constraint | VS1Constraint, + Iota = VS2Constraint | VMConstraint | OneInput, + SlideUp = VS2Constraint | VMConstraint, + Vrgather = VS2Constraint | VS1Constraint | VMConstraint, + Vcompress = VS2Constraint | VS1Constraint, ConstraintOffset = 5, ConstraintMask = 0b1111 diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 1c7f53fecb8c9..ae335ddce882a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// /// /// This file describes the RISC-V instructions from the standard 'V' Vector -/// extension, version 0.8. +/// extension, version 0.9. /// This version is still experimental as the 'V' extension hasn't been /// ratified yet. 
/// @@ -96,27 +96,30 @@ def simm5_plus1 : Operand, ImmLeaf - : RVInstVLU<0b000, mop, lumop, width, (outs VRegOp:$vd), +class VUnitStrideLoad + : RVInstVLU<0b000, width.Value{3}, lumop, width.Value{2-0}, + (outs VRegOp:$vd), (ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">; // load vd, (rs1), rs2, vm -class VStridedLoad - : RVInstVLS<0b000, mop, width, (outs VRegOp:$vd), +class VStridedLoad + : RVInstVLS<0b000, width.Value{3}, width.Value{2-0}, + (outs VRegOp:$vd), (ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr, "$vd, (${rs1}), $rs2$vm">; // load vd, (rs1), vs2, vm -class VIndexedLoad - : RVInstVLX<0b000, mop, width, (outs VRegOp:$vd), +class VIndexedLoad + : RVInstVLX<0b000, width.Value{3}, width.Value{2-0}, + (outs VRegOp:$vd), (ins GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm), opcodestr, "$vd, (${rs1}), $vs2$vm">; // vlr.v vd, (rs1) class VWholeLoad nf, string opcodestr> - : RVInstVLU { let vm = 1; let Uses = []; @@ -125,28 +128,28 @@ class VWholeLoad nf, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { // store vd, vs3, (rs1), vm -class VUnitStrideStore - : RVInstVSU<0b000, mop, sumop, width, (outs), - (ins VRegOp:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr, + : RVInstVSU<0b000, width.Value{3}, sumop, width.Value{2-0}, + (outs), (ins VRegOp:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr, "$vs3, (${rs1})$vm">; // store vd, vs3, (rs1), rs2, vm -class VStridedStore - : RVInstVSS<0b000, mop, width, (outs), +class VStridedStore + : RVInstVSS<0b000, width.Value{3}, width.Value{2-0}, (outs), (ins VRegOp:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr, "$vs3, (${rs1}), $rs2$vm">; // store vd, vs3, (rs1), vs2, vm class VIndexedStore - : RVInstVSX<0b000, mop, width, (outs), + : RVInstVSX<0b000, width.Value{3}, mop, width.Value{2-0}, (outs), (ins VRegOp:$vs3, GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm), opcodestr, "$vs3, (${rs1}), $vs2$vm">; // vsr.v vd, (rs1) class VWholeStore nf, string opcodestr> - : RVInstVSU { let vm = 1; let Uses = []; @@ -372,68 +375,79 @@ def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), } // hasSideEffects = 1, mayLoad = 0, mayStore = 0 // Vector Unit-Stride Instructions -def VLB_V : VUnitStrideLoad; -def VLH_V : VUnitStrideLoad; -def VLW_V : VUnitStrideLoad; - -def VLBU_V : VUnitStrideLoad; -def VLHU_V : VUnitStrideLoad; -def VLWU_V : VUnitStrideLoad; - -def VLE_V : VUnitStrideLoad; - -def VLBFF_V : VUnitStrideLoad; -def VLHFF_V : VUnitStrideLoad; -def VLWFF_V : VUnitStrideLoad; - -def VLBUFF_V : VUnitStrideLoad; -def VLHUFF_V : VUnitStrideLoad; -def VLWUFF_V : VUnitStrideLoad; - -def VLEFF_V : VUnitStrideLoad; - -def VSB_V : VUnitStrideStore; -def VSH_V : VUnitStrideStore; -def VSW_V : VUnitStrideStore; - -def VSE_V : VUnitStrideStore; +def VLE8_V : VUnitStrideLoad; +def VLE16_V : VUnitStrideLoad; +def VLE32_V : VUnitStrideLoad; +def VLE64_V : VUnitStrideLoad; +def VLE128_V : VUnitStrideLoad; +def VLE256_V : VUnitStrideLoad; +def VLE512_V : VUnitStrideLoad; +def VLE1024_V : VUnitStrideLoad; + +def VLE8FF_V : VUnitStrideLoad; +def VLE16FF_V : VUnitStrideLoad; +def VLE32FF_V : VUnitStrideLoad; +def VLE64FF_V : VUnitStrideLoad; +def VLE128FF_V : VUnitStrideLoad; +def VLE256FF_V : VUnitStrideLoad; +def VLE512FF_V : VUnitStrideLoad; +def VLE1024FF_V : VUnitStrideLoad; + +def VSE8_V : VUnitStrideStore; +def VSE16_V : VUnitStrideStore; +def VSE32_V : VUnitStrideStore; +def VSE64_V : VUnitStrideStore; +def VSE128_V : VUnitStrideStore; +def VSE256_V : VUnitStrideStore; +def VSE512_V : VUnitStrideStore; +def VSE1024_V : VUnitStrideStore; // Vector 
Strided Instructions -def VLSB_V : VStridedLoad; -def VLSH_V : VStridedLoad; -def VLSW_V : VStridedLoad; - -def VLSBU_V : VStridedLoad; -def VLSHU_V : VStridedLoad; -def VLSWU_V : VStridedLoad; - -def VLSE_V : VStridedLoad; - -def VSSB_V : VStridedStore; -def VSSH_V : VStridedStore; -def VSSW_V : VStridedStore; -def VSSE_V : VStridedStore; +def VLSE8_V : VStridedLoad; +def VLSE16_V : VStridedLoad; +def VLSE32_V : VStridedLoad; +def VLSE64_V : VStridedLoad; +def VLSE128_V : VStridedLoad; +def VLSE256_V : VStridedLoad; +def VLSE512_V : VStridedLoad; +def VLSE1024_V : VStridedLoad; + +def VSSE8_V : VStridedStore; +def VSSE16_V : VStridedStore; +def VSSE32_V : VStridedStore; +def VSSE64_V : VStridedStore; +def VSSE128_V : VStridedStore; +def VSSE256_V : VStridedStore; +def VSSE512_V : VStridedStore; +def VSSE1024_V : VStridedStore; // Vector Indexed Instructions -def VLXB_V : VIndexedLoad; -def VLXH_V : VIndexedLoad; -def VLXW_V : VIndexedLoad; - -def VLXBU_V : VIndexedLoad; -def VLXHU_V : VIndexedLoad; -def VLXWU_V : VIndexedLoad; - -def VLXE_V : VIndexedLoad; - -def VSXB_V : VIndexedStore; -def VSXH_V : VIndexedStore; -def VSXW_V : VIndexedStore; -def VSXE_V : VIndexedStore; - -def VSUXB_V : VIndexedStore; -def VSUXH_V : VIndexedStore; -def VSUXW_V : VIndexedStore; -def VSUXE_V : VIndexedStore; +def VLXEI8_V : VIndexedLoad; +def VLXEI16_V : VIndexedLoad; +def VLXEI32_V : VIndexedLoad; +def VLXEI64_V : VIndexedLoad; +def VLXEI128_V : VIndexedLoad; +def VLXEI256_V : VIndexedLoad; +def VLXEI512_V : VIndexedLoad; +def VLXEI1024_V : VIndexedLoad; + +def VSXEI8_V : VIndexedStore; +def VSXEI16_V : VIndexedStore; +def VSXEI32_V : VIndexedStore; +def VSXEI64_V : VIndexedStore; +def VSXEI128_V : VIndexedStore; +def VSXEI256_V : VIndexedStore; +def VSXEI512_V : VIndexedStore; +def VSXEI1024_V : VIndexedStore; + +def VSUXEI8_V : VIndexedStore; +def VSUXEI16_V : VIndexedStore; +def VSUXEI32_V : VIndexedStore; +def VSUXEI64_V : VIndexedStore; +def VSUXEI128_V : VIndexedStore; +def VSUXEI256_V : VIndexedStore; +def VSUXEI512_V : VIndexedStore; +def VSUXEI1024_V : VIndexedStore; def VL1R_V : VWholeLoad<0, "vl1r.v">; def VS1R_V : VWholeStore<0, "vs1r.v">; @@ -472,13 +486,25 @@ def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm", def : InstAlias<"vwcvtu.x.x.v $vd, $vs$vm", (VWADDU_VX VRegOp:$vd, VRegOp:$vs, X0, VMaskOp:$vm)>; +// Vector Integer Extension +defm VZEXT_VF8 : VALU_MV_VS2<"vzext.vf8", 0b010010, 0b00010>; +defm VSEXT_VF8 : VALU_MV_VS2<"vsext.vf8", 0b010010, 0b00011>; +defm VZEXT_VF4 : VALU_MV_VS2<"vzext.vf4", 0b010010, 0b00100>; +defm VSEXT_VF4 : VALU_MV_VS2<"vsext.vf4", 0b010010, 0b00101>; +defm VZEXT_VF2 : VALU_MV_VS2<"vzext.vf2", 0b010010, 0b00110>; +defm VSEXT_VF2 : VALU_MV_VS2<"vsext.vf2", 0b010010, 0b00111>; + // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions defm VADC_V : VALUm_IV_V_X_I<"vadc", 0b010000>; +let Constraints = "@earlyclobber $vd", RVVConstraint = Vmadc in { defm VMADC_V : VALUm_IV_V_X_I<"vmadc", 0b010001>; defm VMADC_V : VALUNoVm_IV_V_X_I<"vmadc", 0b010001>; +} // Constraints = "@earlyclobber $vd", RVVConstraint = Vmadc defm VSBC_V : VALUm_IV_V_X<"vsbc", 0b010010>; +let Constraints = "@earlyclobber $vd", RVVConstraint = Vmadc in { defm VMSBC_V : VALUm_IV_V_X<"vmsbc", 0b010011>; defm VMSBC_V : VALUNoVm_IV_V_X<"vmsbc", 0b010011>; +} // Constraints = "@earlyclobber $vd", RVVConstraint = Vmadc // Vector Bitwise Logical Instructions defm VAND_V : VALU_IV_V_X_I<"vand", 0b001001>; @@ -504,6 +530,7 @@ defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">; } // 
Constraints = "@earlyclobber $vd", RVVConstraint = Narrow // Vector Integer Comparison Instructions +let RVVConstraint = NoConstraint in { defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>; defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>; defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>; @@ -512,6 +539,7 @@ defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>; defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>; defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>; defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>; +} // RVVConstraint = NoConstraint def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm", (VMSLTU_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>; @@ -664,7 +692,7 @@ defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV // Vector Floating-Point Square-Root Instruction -defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b100011, 0b00000>; +defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>; // Vector Floating-Point MIN/MAX Instructions defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>; @@ -676,12 +704,14 @@ defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>; defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>; // Vector Floating-Point Compare Instructions +let RVVConstraint = NoConstraint in { defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>; defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>; defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>; defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>; defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>; defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>; +} // RVVConstraint = NoConstraint def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm", (VMFLT_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>; @@ -689,7 +719,7 @@ def : InstAlias<"vmfge.vv $vd, $va, $vb$vm", (VMFLE_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>; // Vector Floating-Point Classify Instruction -defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b100011, 0b10000>; +defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b010011, 0b10000>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { // Vector Floating-Point Merge Instruction @@ -708,31 +738,38 @@ def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd), } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 // Single-Width Floating-Point/Integer Type-Convert Instructions -defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b100010, 0b00000>; -defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b100010, 0b00001>; -defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b100010, 0b00010>; -defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b100010, 0b00011>; +defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>; +defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>; +defm VFCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; +defm VFCVT_RTZ_X_F_V : VALU_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; +defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>; +defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; // Widening Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in { -defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b100010, 0b01000>; -defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b100010, 0b01001>; -defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b100010, 0b01010>; -defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b100010, 0b01011>; -defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b100010, 0b01100>; +defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>; +defm VFWCVT_X_F_V : 
VALU_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>; +defm VFWCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>; +defm VFWCVT_RTZ_X_F_V : VALU_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>; +defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>; +defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>; +defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt // Narrowing Floating-Point/Integer Type-Convert Instructions -let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in { -defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b100010, 0b10000>; -defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b100010, 0b10001>; -defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b100010, 0b10010>; -defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b100010, 0b10011>; -defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b100010, 0b10100>; -defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b100010, 0b10101>; -} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow +let Constraints = "@earlyclobber $vd", RVVConstraint = NarrowCvt in { +defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>; +defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>; +defm VFNCVT_RTZ_XU_F_W : VALU_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>; +defm VFNCVT_RTZ_X_F_W : VALU_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>; +defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>; +defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>; +defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>; +defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>; +} // Constraints = "@earlyclobber $vd", RVVConstraint = NarrowCvt // Vector Single-Width Integer Reduction Instructions +let RVVConstraint = NoConstraint in { defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>; defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>; defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>; @@ -741,32 +778,35 @@ defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>; defm VREDAND : VALU_MV_V<"vredand", 0b000001>; defm VREDOR : VALU_MV_V<"vredor", 0b000010>; defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>; +} // RVVConstraint = NoConstraint // Vector Widening Integer Reduction Instructions -let Constraints = "@earlyclobber $vd" in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>; defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>; -} // Constraints = "@earlyclobber $vd" +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint // Vector Single-Width Floating-Point Reduction Instructions +let RVVConstraint = NoConstraint in { defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>; defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>; defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>; defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>; +} // RVVConstraint = NoConstraint // Vector Widening Floating-Point Reduction Instructions -let Constraints = "@earlyclobber $vd" in { +let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { // Set earlyclobber for following instructions for second and mask operands. 
// This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>; defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>; -} // Constraints = "@earlyclobber $vd" +} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint // Vector Mask-Register Logical Instructions defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">; @@ -778,7 +818,7 @@ defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">; defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">; defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">; -def : InstAlias<"vmcpy.m $vd, $vs", +def : InstAlias<"vmmv.m $vd, $vs", (VMAND_MM VRegOp:$vd, VRegOp:$vs, VRegOp:$vs)>; def : InstAlias<"vmclr.m $vd", (VMXOR_MM VRegOp:$vd, VRegOp:$vd, VRegOp:$vd)>; @@ -847,8 +887,10 @@ defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>; let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>; +defm VFSLIDE1UP_V : VALU_FV_F<"vfslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>; +defm VFSLIDE1DOWN_V : VALU_FV_F<"vfslide1down", 0b001111>; // Vector Register Gather Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in { diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 83c44e0682cef..5a9ecfe74ecc8 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -255,12 +255,6 @@ namespace { } } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - // FIXME. - return false; - } - /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. bool fixupNeedsRelaxation(const MCFixup &Fixup, diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index e62f5040898f0..5f276f7935784 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -54,10 +54,6 @@ class SystemZMCAsmBackend : public MCAsmBackend { const MCValue &Target, MutableArrayRef Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const override; - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *Fragment, const MCAsmLayout &Layout) const override { diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 864200e5f71cc..8758ddee0aabf 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -699,11 +699,12 @@ getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, } int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { // FIXME: Can the logic below also be used for these cost kinds? 
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) { - int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I); + int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); return BaseCost == 0 ? BaseCost : 1; } @@ -786,8 +787,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. Base implementation does not // realize float->int gets scalarized. - unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(), - Src->getScalarType(), CostKind); + unsigned ScalarCost = getCastInstrCost( + Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind); unsigned TotCost = VF * ScalarCost; bool NeedsInserts = true, NeedsExtracts = true; // FP128 registers do not get inserted or extracted. @@ -828,7 +829,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } } - return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I); + return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); } // Scalar i8 / i16 operations will typically be made after first extending diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 7f8f7f6f923ff..1aa31ff1690c6 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -93,7 +93,7 @@ class SystemZTTIImpl : public BasicTTIImplBase { unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td index 4f04dae884ab5..acdae68323fdd 100644 --- a/llvm/lib/Target/VE/VECallingConv.td +++ b/llvm/lib/Target/VE/VECallingConv.td @@ -14,13 +14,6 @@ // Aurora VE //===----------------------------------------------------------------------===// def CC_VE_C_Stack: CallingConv<[ - // float --> need special handling like below. - // 0 4 - // +------+------+ - // | empty| float| - // +------+------+ - CCIfType<[f32], CCCustom<"allocateFloat">>, - // All of the rest are assigned to the stack in 8-byte aligned units. CCAssignToStack<0, 8> ]>; @@ -28,20 +21,18 @@ def CC_VE_C_Stack: CallingConv<[ def CC_VE : CallingConv<[ // All arguments get passed in generic registers if there is space. - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, + // Promote i1/i8/i16/i32 arguments to i64. + CCIfType<[i1, i8, i16, i32], CCPromoteToType>, - // bool, char, int, enum, long --> generic integer 32 bit registers - CCIfType<[i32], CCAssignToRegWithShadow< - [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7], - [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, - - // float --> generic floating point 32 bit registers - CCIfType<[f32], CCAssignToRegWithShadow< - [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7], - [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, + // Convert float arguments to i64 with padding. 
+ // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + CCIfType<[f32], CCBitConvertToType>, - // long long/double --> generic 64 bit registers + // bool, char, int, enum, long, long long, float, double + // --> generic 64 bit registers CCIfType<[i64, f64], CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, @@ -52,31 +43,32 @@ def CC_VE : CallingConv<[ // All arguments get passed in stack for varargs function or non-prototyped // function. def CC_VE2 : CallingConv<[ - // float --> need special handling like below. - // 0 4 + // Promote i1/i8/i16/i32 arguments to i64. + CCIfType<[i1, i8, i16, i32], CCPromoteToType>, + + // Convert float arguments to i64 with padding. + // 63 31 0 // +------+------+ - // | empty| float| + // | float| 0 | // +------+------+ - CCIfType<[f32], CCCustom<"allocateFloat">>, + CCIfType<[f32], CCBitConvertToType>, CCAssignToStack<0, 8> ]>; def RetCC_VE : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType>, - - // bool, char, int, enum, long --> generic integer 32 bit registers - CCIfType<[i32], CCAssignToRegWithShadow< - [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7], - [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, + // Promote i1/i8/i16/i32 return values to i64. + CCIfType<[i1, i8, i16, i32], CCPromoteToType>, - // float --> generic floating point 32 bit registers - CCIfType<[f32], CCAssignToRegWithShadow< - [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7], - [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, + // Convert float return values to i64 with padding. + // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + CCIfType<[f32], CCBitConvertToType>, - // long long/double --> generic 64 bit registers + // bool, char, int, enum, long, long long, float, double + // --> generic 64 bit registers CCIfType<[i64, f64], CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>, ]>; diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index ab720545dd831..e2232f4500e35 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -38,28 +38,6 @@ using namespace llvm; // Calling Convention Implementation //===----------------------------------------------------------------------===// -static bool allocateFloat(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - switch (LocVT.SimpleTy) { - case MVT::f32: { - // Allocate stack like below - // 0 4 - // +------+------+ - // | empty| float| - // +------+------+ - // Use align=8 for dummy area to align the beginning of these 2 area. - State.AllocateStack(4, Align(8)); // for empty area - // Use align=4 for value to place it at just after the dummy area. - unsigned Offset = State.AllocateStack(4, Align(4)); // for float value area - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - return true; - } - default: - return false; - } -} - #include "VEGenCallingConv.inc" bool VETargetLowering::CanLowerReturn( @@ -109,6 +87,22 @@ VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, case CCValAssign::AExt: OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal); break; + case CCValAssign::BCvt: { + // Convert a float return value to i64 with padding. 
+ // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT() == MVT::f32); + SDValue Undef = SDValue( + DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0); + SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); + OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, + MVT::i64, Undef, OutVal, Sub_f32), + 0); + break; + } default: llvm_unreachable("Unknown loc info!"); } @@ -179,6 +173,20 @@ SDValue VETargetLowering::LowerFormalArguments( Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg, DAG.getValueType(VA.getValVT())); break; + case CCValAssign::BCvt: { + // Extract a float argument from i64 with padding. + // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT() == MVT::f32); + SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); + Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, + MVT::f32, Arg, Sub_f32), + 0); + break; + } default: break; } @@ -197,6 +205,20 @@ SDValue VETargetLowering::LowerFormalArguments( // beginning of the arguments area at %fp+176. unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset; unsigned ValSize = VA.getValVT().getSizeInBits() / 8; + + // Adjust the offset for a float argument by adding 4, since the argument + // is stored in an 8-byte buffer with the layout below. LLVM generates a + // 4-byte load instruction, so the offset needs adjusting here. This + // adjustment is required only in LowerFormalArguments. In LowerCall, + // a float argument is converted to i64 first and stored as 8 bytes of + // data, as the ABI requires, so no adjustment is needed there. + // 0 4 + // +------+------+ + // | empty| float| + // +------+------+ + if (VA.getValVT() == MVT::f32) + Offset += 4; + int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true); InVals.push_back( DAG.getLoad(VA.getValVT(), DL, Chain, @@ -371,6 +393,22 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::BCvt: { + // Convert a float argument to i64 with padding. + // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT() == MVT::f32); + SDValue Undef = SDValue( + DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0); + SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); + Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, + MVT::i64, Undef, Arg, Sub_f32), + 0); + break; + } } if (VA.isRegLoc()) { @@ -488,6 +526,20 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV, DAG.getValueType(VA.getValVT())); break; + case CCValAssign::BCvt: { + // Extract a float return value from i64 with padding.
+ // 63 31 0 + // +------+------+ + // | float| 0 | + // +------+------+ + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT() == MVT::f32); + SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); + RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, + MVT::f32, RV, Sub_f32), + 0); + break; + } default: break; } diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index b730172715d93..3cbc21c4cbe2d 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -48,6 +48,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyRuntimeLibcallSignatures.cpp WebAssemblySelectionDAGInfo.cpp WebAssemblySetP2AlignOperands.cpp + WebAssemblySortRegion.cpp WebAssemblyMemIntrinsicResults.cpp WebAssemblySubtarget.cpp WebAssemblyTargetMachine.cpp diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index 8ecd7c53621df..d88311197c1ad 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -59,11 +59,6 @@ class WebAssemblyAsmBackend final : public MCAsmBackend { return false; } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - return false; - } - bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 02b310628ee17..631e96dd9246b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -232,6 +232,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { WASM_LOAD_STORE(ATOMIC_NOTIFY) WASM_LOAD_STORE(ATOMIC_WAIT_I32) WASM_LOAD_STORE(LOAD_SPLAT_v32x4) + WASM_LOAD_STORE(LOAD_ZERO_v4i32) return 2; WASM_LOAD_STORE(LOAD_I64) WASM_LOAD_STORE(LOAD_F64) @@ -254,6 +255,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32) WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64) WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64) + WASM_LOAD_STORE(LOAD_ZERO_v2i64) return 3; WASM_LOAD_STORE(LOAD_V128) WASM_LOAD_STORE(STORE_V128) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 8442b49e25f45..eb3e9b91d40df 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyExceptionInfo.h" +#include "WebAssemblySortRegion.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" #include "llvm/ADT/PriorityQueue.h" @@ -31,6 +32,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +using WebAssembly::SortRegion; +using WebAssembly::SortRegionInfo; #define DEBUG_TYPE "wasm-cfg-sort" @@ -44,78 +47,6 @@ static cl::opt WasmDisableEHPadSort( namespace { -// Wrapper for loops and exceptions -class Region { -public: - virtual ~Region() = default; - virtual MachineBasicBlock *getHeader() const = 0; - virtual bool contains(const MachineBasicBlock *MBB) const = 0; - virtual unsigned getNumBlocks() const = 0; - using block_iterator = typename ArrayRef::const_iterator; - virtual iterator_range blocks() const = 0; - 
virtual bool isLoop() const = 0; -}; - -template <class T> class ConcreteRegion : public Region { - const T *Region; - -public: - ConcreteRegion(const T *Region) : Region(Region) {} - MachineBasicBlock *getHeader() const override { return Region->getHeader(); } - bool contains(const MachineBasicBlock *MBB) const override { - return Region->contains(MBB); - } - unsigned getNumBlocks() const override { return Region->getNumBlocks(); } - iterator_range<block_iterator> blocks() const override { - return Region->blocks(); - } - bool isLoop() const override { return false; } -}; - -template <> bool ConcreteRegion<MachineLoop>::isLoop() const { return true; } - -// This class has information of nested Regions; this is analogous to what -// LoopInfo is for loops. -class RegionInfo { - const MachineLoopInfo &MLI; - const WebAssemblyExceptionInfo &WEI; - DenseMap<const MachineLoop *, std::unique_ptr<ConcreteRegion<MachineLoop>>> LoopMap; - DenseMap<const WebAssemblyException *, std::unique_ptr<ConcreteRegion<WebAssemblyException>>> ExceptionMap; - -public: - RegionInfo(const MachineLoopInfo &MLI, const WebAssemblyExceptionInfo &WEI) - : MLI(MLI), WEI(WEI) {} - - // Returns the smallest loop or exception that contains MBB - const Region *getRegionFor(const MachineBasicBlock *MBB) { - const auto *ML = MLI.getLoopFor(MBB); - const auto *WE = WEI.getExceptionFor(MBB); - if (!ML && !WE) - return nullptr; - // We determine subregion relationship by domination of their headers, i.e., - // if region A's header dominates region B's header, B is a subregion of A. - // WebAssemblyException contains BBs in all its subregions (loops or - // exceptions), but MachineLoop may not, because MachineLoop does not contain - // BBs that don't have a path to its header even if they are dominated by - // its header. So here we should use WE->contains(ML->getHeader()), but not - // ML->contains(WE->getHeader()). - if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) { - // If the smallest region containing MBB is a loop - if (LoopMap.count(ML)) - return LoopMap[ML].get(); - LoopMap[ML] = std::make_unique<ConcreteRegion<MachineLoop>>(ML); - return LoopMap[ML].get(); - } else { - // If the smallest region containing MBB is an exception - if (ExceptionMap.count(WE)) - return ExceptionMap[WE].get(); - ExceptionMap[WE] = - std::make_unique<ConcreteRegion<WebAssemblyException>>(WE); - return ExceptionMap[WE].get(); - } - } -}; - class WebAssemblyCFGSort final : public MachineFunctionPass { StringRef getPassName() const override { return "WebAssembly CFG Sort"; } @@ -236,14 +167,14 @@ struct CompareBlockNumbersBackwards { /// Bookkeeping for a region to help ensure that we don't mix blocks not /// dominated by its header among its blocks. struct Entry { - const Region *TheRegion; + const SortRegion *TheRegion; unsigned NumBlocksLeft; /// List of blocks not dominated by Loop's header that are deferred until /// after all of Loop's blocks have been seen. std::vector<MachineBasicBlock *> Deferred; - explicit Entry(const class Region *R) + explicit Entry(const SortRegion *R) : TheRegion(R), NumBlocksLeft(R->getNumBlocks()) {} }; } // end anonymous namespace @@ -287,10 +218,10 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, CompareBlockNumbersBackwards> Ready; - RegionInfo RI(MLI, WEI); + SortRegionInfo SRI(MLI, WEI); SmallVector<Entry, 4> Entries; for (MachineBasicBlock *MBB = &MF.front();;) { - const Region *R = RI.getRegionFor(MBB); + const SortRegion *R = SRI.getRegionFor(MBB); if (R) { // If MBB is a region header, add it to the active region list.
We can't // put any blocks that it doesn't dominate until we see the end of the @@ -373,7 +304,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, MF.RenumberBlocks(); #ifndef NDEBUG - SmallSetVector OnStack; + SmallSetVector OnStack; // Insert a sentinel representing the degenerate loop that starts at the // function entry block and includes the entire function as a "loop" that @@ -382,7 +313,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, for (auto &MBB : MF) { assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative."); - const Region *Region = RI.getRegionFor(&MBB); + const SortRegion *Region = SRI.getRegionFor(&MBB); if (Region && &MBB == Region->getHeader()) { // Region header. @@ -408,10 +339,10 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, for (auto Pred : MBB.predecessors()) assert(Pred->getNumber() < MBB.getNumber() && "Non-loop-header predecessors should be topologically sorted"); - assert(OnStack.count(RI.getRegionFor(&MBB)) && + assert(OnStack.count(SRI.getRegionFor(&MBB)) && "Blocks must be nested in their regions"); } - while (OnStack.size() > 1 && &MBB == WebAssembly::getBottom(OnStack.back())) + while (OnStack.size() > 1 && &MBB == SRI.getBottom(OnStack.back())) OnStack.pop_back(); } assert(OnStack.pop_back_val() == nullptr && diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 8cbfc98e81978..02330a2dd4afa 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "WebAssemblyExceptionInfo.h" #include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySortRegion.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyUtilities.h" #include "llvm/ADT/Statistic.h" @@ -33,6 +34,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; +using WebAssembly::SortRegionInfo; #define DEBUG_TYPE "wasm-cfg-stackify" @@ -382,6 +384,8 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { MachineFunction &MF = *MBB.getParent(); const auto &MLI = getAnalysis(); + const auto &WEI = getAnalysis(); + SortRegionInfo SRI(MLI, WEI); const auto &TII = *MF.getSubtarget().getInstrInfo(); MachineLoop *Loop = MLI.getLoopFor(&MBB); @@ -390,7 +394,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { // The operand of a LOOP is the first block after the loop. If the loop is the // bottom of the function, insert a dummy block at the end. - MachineBasicBlock *Bottom = WebAssembly::getBottom(Loop); + MachineBasicBlock *Bottom = SRI.getBottom(Loop); auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { getAppendixBlock(MF); @@ -450,7 +454,9 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { MachineFunction &MF = *MBB.getParent(); auto &MDT = getAnalysis(); const auto &TII = *MF.getSubtarget().getInstrInfo(); + const auto &MLI = getAnalysis(); const auto &WEI = getAnalysis(); + SortRegionInfo SRI(MLI, WEI); const auto &MFI = *MF.getInfo(); // Compute the nearest common dominator of all unwind predecessors @@ -470,7 +476,7 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { // end. 
WebAssemblyException *WE = WEI.getExceptionFor(&MBB); assert(WE); - MachineBasicBlock *Bottom = WebAssembly::getBottom(WE); + MachineBasicBlock *Bottom = SRI.getBottom(WE); auto Iter = std::next(Bottom->getIterator()); if (Iter == MF.end()) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp index 7f805b34b4995..52aa3534c78ef 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp @@ -41,13 +41,51 @@ class WebAssemblyFixBrTableDefaults final : public MachineFunctionPass { char WebAssemblyFixBrTableDefaults::ID = 0; +// Target independent selection dag assumes that it is ok to use PointerTy +// as the index for a "switch", whereas Wasm so far only has a 32-bit br_table. +// See e.g. SelectionDAGBuilder::visitJumpTableHeader +// We have a 64-bit br_table in the tablegen defs as a result, which does get +// selected, and thus we get incorrect truncates/extensions happening on +// wasm64. Here we fix that. +void fixBrTableIndex(MachineInstr &MI, MachineBasicBlock *MBB, + MachineFunction &MF) { + // Only happens on wasm64. + auto &WST = MF.getSubtarget<WebAssemblySubtarget>(); + if (!WST.hasAddr64()) + return; + + assert(MI.getDesc().getOpcode() == WebAssembly::BR_TABLE_I64 && + "64-bit br_table pseudo instruction expected"); + + // Find extension op, if any. It sits in the previous BB before the branch. + auto ExtMI = MF.getRegInfo().getVRegDef(MI.getOperand(0).getReg()); + if (ExtMI->getOpcode() == WebAssembly::I64_EXTEND_U_I32) { + // Unnecessarily extending a 32-bit value to 64, remove it. + assert(MI.getOperand(0).getReg() == ExtMI->getOperand(0).getReg()); + MI.getOperand(0).setReg(ExtMI->getOperand(1).getReg()); + ExtMI->eraseFromParent(); + } else { + // Incoming 64-bit value that needs to be truncated. + Register Reg32 = + MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(), + WST.getInstrInfo()->get(WebAssembly::I32_WRAP_I64), Reg32) + .addReg(MI.getOperand(0).getReg()); + MI.getOperand(0).setReg(Reg32); + } + + // We now have a 32-bit operand in all cases, so change the instruction + // accordingly. + MI.setDesc(WST.getInstrInfo()->get(WebAssembly::BR_TABLE_I32)); +} + // `MI` is a br_table instruction with a dummy default target argument. This // function finds and adds the default target argument and removes any redundant // range check preceding the br_table. Returns the MBB that the br_table is // moved into so it can be removed from further consideration, or nullptr if the // br_table cannot be optimized. -MachineBasicBlock *fixBrTable(MachineInstr &MI, MachineBasicBlock *MBB, - MachineFunction &MF) { +MachineBasicBlock *fixBrTableDefault(MachineInstr &MI, MachineBasicBlock *MBB, + MachineFunction &MF) { // Get the header block, which contains the redundant range check.
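To make the intent of fixBrTableIndex above concrete, here is a minimal sketch (not part of the patch) of the two MIR shapes it normalizes; MRI and Idx are stand-ins for MF.getRegInfo() and the br_table's index operand MI.getOperand(0):

    // Illustrative only; names other than the opcodes are assumptions.
    MachineInstr *Def = MRI.getVRegDef(Idx.getReg());
    if (Def->getOpcode() == WebAssembly::I64_EXTEND_U_I32) {
      // Shape 1: the i64 index was itself zero-extended from i32. Reuse the
      // original 32-bit register and delete the now-dead extend.
      Idx.setReg(Def->getOperand(1).getReg());
      Def->eraseFromParent();
    } else {
      // Shape 2: a genuinely 64-bit index. Emit an explicit i32.wrap_i64
      // into a fresh vreg so the rewritten BR_TABLE_I32 sees a 32-bit
      // operand.
    }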
assert(MBB->pred_size() == 1 && "Expected a single guard predecessor"); auto *HeaderMBB = *MBB->pred_begin(); @@ -125,7 +163,8 @@ bool WebAssemblyFixBrTableDefaults::runOnMachineFunction(MachineFunction &MF) { MBBSet.erase(MBB); for (auto &MI : *MBB) { if (WebAssembly::isBrTable(MI)) { - auto *Fixed = fixBrTable(MI, MBB, MF); + fixBrTableIndex(MI, MBB, MF); + auto *Fixed = fixBrTableDefault(MI, MBB, MF); if (Fixed != nullptr) { MBBSet.erase(Fixed); Changed = true; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index dee1c4e281494..5720d3e5afb02 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -29,6 +29,10 @@ HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(WIDEN_LOW_S) +HANDLE_NODETYPE(WIDEN_LOW_U) +HANDLE_NODETYPE(WIDEN_HIGH_S) +HANDLE_NODETYPE(WIDEN_HIGH_U) HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(MEMORY_COPY) HANDLE_NODETYPE(MEMORY_FILL) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index c6519fafbc491..5d377e8aa9bd1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -123,6 +123,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Hoist bitcasts out of shuffles setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + // Combine extends of extract_subvectors into widening ops + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) @@ -246,6 +250,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal); } + // And some truncating stores are legal as well + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); } // Don't do anything clever with build_pairs @@ -668,6 +675,15 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(8); Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad; return true; + case Intrinsic::wasm_load32_zero: + case Intrinsic::wasm_load64_zero: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8); + Info.flags = MachineMemOperand::MOLoad; + return true; default: return false; } @@ -1742,6 +1758,49 @@ performVECTOR_SHUFFLECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DAG.getBitcast(DstType, NewShuffle); } +static SDValue performVectorWidenCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + assert(N->getOpcode() == ISD::SIGN_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND); + + // Combine ({s,z}ext (extract_subvector src, i)) into a widening operation if + // possible before the extract_subvector can be expanded. 
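As a concrete illustration of the DAG shape performVectorWidenCombine matches, consider the following hand-built nodes (a sketch, not code from the patch; DAG, DL, and a v16i8 value Src are assumed to be in scope):

    // (sext (extract_subvector Src, 8)) over the high half of a v16i8...
    SDValue High = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, Src,
                               DAG.getVectorIdxConstant(8, DL));
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v8i16, High);
    // ...is folded into a single (WebAssemblyISD::WIDEN_HIGH_S Src) node,
    // i.e. the i16x8.widen_high_i8x16_s instruction. Index 0 selects the
    // WIDEN_LOW_* forms; zext selects the *_U forms.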
+ auto Extract = N->getOperand(0); + if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + auto Source = Extract.getOperand(0); + auto *IndexNode = dyn_cast(Extract.getOperand(1)); + if (IndexNode == nullptr) + return SDValue(); + auto Index = IndexNode->getZExtValue(); + + // Only v8i8 and v4i16 extracts can be widened, and only if the extracted + // subvector is the low or high half of its source. + EVT ResVT = N->getValueType(0); + if (ResVT == MVT::v8i16) { + if (Extract.getValueType() != MVT::v8i8 || + Source.getValueType() != MVT::v16i8 || (Index != 0 && Index != 8)) + return SDValue(); + } else if (ResVT == MVT::v4i32) { + if (Extract.getValueType() != MVT::v4i16 || + Source.getValueType() != MVT::v8i16 || (Index != 0 && Index != 4)) + return SDValue(); + } else { + return SDValue(); + } + + bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND; + bool IsLow = Index == 0; + + unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::WIDEN_LOW_S + : WebAssemblyISD::WIDEN_HIGH_S) + : (IsLow ? WebAssemblyISD::WIDEN_LOW_U + : WebAssemblyISD::WIDEN_HIGH_U); + + return DAG.getNode(Op, SDLoc(N), ResVT, Source); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -1750,5 +1809,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return performVectorWidenCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index b3c63cc1f884b..48b934457267e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -70,7 +70,7 @@ defm LOAD_F64 : WebAssemblyLoad; multiclass LoadPatNoOffset { def : Pat<(ty (kind I32:$addr)), (!cast(inst # "_A32") 0, 0, I32:$addr)>, Requires<[HasAddr32]>; - def : Pat<(ty (kind I64:$addr)), (!cast(inst # "_A64") 0, 0, I64:$addr)>, + def : Pat<(ty (kind (i64 I64:$addr))), (!cast(inst # "_A64") 0, 0, I64:$addr)>, Requires<[HasAddr64]>; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index b603701ab930e..cd088751bc8a9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -163,6 +163,43 @@ defm : LoadPatGlobalAddrOffOnly(exts[0]#types[1]), "LOAD_EXTEND"#exts[1]#"_"#types[0]>; } +// Load lane into zero vector +multiclass SIMDLoadZero simdop> { + let mayLoad = 1, UseNamedOperandTable = 1 in { + defm LOAD_ZERO_#vec_t#_A32 : + SIMD_I<(outs V128:$dst), + (ins P2Align:$p2align, offset32_op:$off, I32:$addr), + (outs), (ins P2Align:$p2align, offset32_op:$off), [], + name#"\t$dst, ${off}(${addr})$p2align", + name#"\t$off$p2align", simdop>; + defm LOAD_ZERO_#vec_t#_A64 : + SIMD_I<(outs V128:$dst), + (ins P2Align:$p2align, offset64_op:$off, I64:$addr), + (outs), (ins P2Align:$p2align, offset64_op:$off), [], + name#"\t$dst, ${off}(${addr})$p2align", + name#"\t$off$p2align", simdop>; + } // mayLoad = 1, UseNamedOperandTable = 1 +} + +// TODO: Also support v4f32 and v2f64 once the instructions are merged +// to the proposal +defm "" : SIMDLoadZero; +defm "" : SIMDLoadZero; + +defm : LoadPatNoOffset; +defm : LoadPatNoOffset; + +defm : LoadPatImmOff; +defm : LoadPatImmOff; + +defm : LoadPatImmOff; +defm : LoadPatImmOff; + +defm : LoadPatOffsetOnly; +defm : LoadPatOffsetOnly; + +defm : 
LoadPatGlobalAddrOffOnly; +defm : LoadPatGlobalAddrOffOnly; // Store: v128.store let mayStore = 1, UseNamedOperandTable = 1 in { @@ -800,7 +837,7 @@ let isCommutable = 1 in defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), [(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))], "i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s", - 180>; + 186>; //===----------------------------------------------------------------------===// // Floating-point unary arithmetic @@ -885,16 +922,28 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; +// Lower llvm.wasm.trunc.saturate.* to saturating instructions +def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), + (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>; +def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))), + (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>; + // Widening operations +def widen_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; +def widen_low_s : SDNode<"WebAssemblyISD::WIDEN_LOW_S", widen_t>; +def widen_high_s : SDNode<"WebAssemblyISD::WIDEN_HIGH_S", widen_t>; +def widen_low_u : SDNode<"WebAssemblyISD::WIDEN_LOW_U", widen_t>; +def widen_high_u : SDNode<"WebAssemblyISD::WIDEN_HIGH_U", widen_t>; + multiclass SIMDWiden baseInst> { - defm "" : SIMDConvert; - defm "" : SIMDConvert; - defm "" : SIMDConvert; - defm "" : SIMDConvert; } @@ -921,11 +970,95 @@ multiclass SIMDNarrow; defm "" : SIMDNarrow; -// Lower llvm.wasm.trunc.saturate.* to saturating instructions -def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), - (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>; -def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))), - (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>; +// Use narrowing operations for truncating stores. Since the narrowing +// operations are saturating instead of truncating, we need to mask +// the stored values first. 
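The masking step is what turns the saturating narrow into a plain truncation. A scalar model of the intended v8i16-to-v8i8 truncating-store semantics (illustration only, not part of the patch):

    #include <cstdint>
    // Scalar model of a v8i16 -> v8i8 truncating store. Masking each lane
    // with 0xff first means the subsequent saturating narrow can never
    // clamp: e.g. a lane holding 0x0100 stores as 0x00 (truncation), where
    // an unmasked saturating narrow_u would produce 0xff.
    void truncStoreV8i16ToV8i8(uint8_t *Dst, const uint16_t Lanes[8]) {
      for (int I = 0; I != 8; ++I)
        Dst[I] = uint8_t(Lanes[I] & 0xff);
    }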
+// TODO: Use consts instead of splats +def store_v8i8_trunc_v8i16 : + OutPatFrag<(ops node:$val), + (EXTRACT_LANE_v2i64 + (NARROW_U_v16i8 + (AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x00ff00ff)), node:$val), + node:$val // Unused input + ), + 0 + )>; + +def store_v4i16_trunc_v4i32 : + OutPatFrag<(ops node:$val), + (EXTRACT_LANE_v2i64 + (NARROW_U_v8i16 + (AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x0000ffff)), node:$val), + node:$val // Unused input + ), + 0 + )>; + +// Store patterns adapted from WebAssemblyInstrMemory.td +multiclass NarrowingStorePatNoOffset { + def : Pat<(node ty:$val, I32:$addr), + (STORE_I64_A32 0, 0, I32:$addr, (i64 (out ty:$val)))>, + Requires<[HasAddr32]>; + def : Pat<(node ty:$val, I64:$addr), + (STORE_I64_A64 0, 0, I64:$addr, (i64 (out ty:$val)))>, + Requires<[HasAddr64]>; +} + +defm : NarrowingStorePatNoOffset; +defm : NarrowingStorePatNoOffset; + +multiclass NarrowingStorePatImmOff { + def : Pat<(kind ty:$val, (operand I32:$addr, imm:$off)), + (STORE_I64_A32 0, imm:$off, I32:$addr, (i64 (out ty:$val)))>, + Requires<[HasAddr32]>; + def : Pat<(kind ty:$val, (operand I64:$addr, imm:$off)), + (STORE_I64_A64 0, imm:$off, I64:$addr, (i64 (out ty:$val)))>, + Requires<[HasAddr64]>; +} + +defm : NarrowingStorePatImmOff; +defm : NarrowingStorePatImmOff; +defm : NarrowingStorePatImmOff; +defm : NarrowingStorePatImmOff; + +multiclass NarrowingStorePatOffsetOnly { + def : Pat<(kind ty:$val, imm:$off), + (STORE_I64_A32 0, imm:$off, (CONST_I32 0), (i64 (out ty:$val)))>, + Requires<[HasAddr32]>; + def : Pat<(kind ty:$val, imm:$off), + (STORE_I64_A64 0, imm:$off, (CONST_I64 0), (i64 (out ty:$val)))>, + Requires<[HasAddr64]>; +} + +defm : NarrowingStorePatOffsetOnly; +defm : NarrowingStorePatOffsetOnly; + +multiclass NarrowingStorePatGlobalAddrOffOnly { + def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I64_A32 + 0, tglobaladdr:$off, (CONST_I32 0), (i64 (out ty:$val)))>, + Requires<[IsNotPIC, HasAddr32]>; + def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I64_A64 + 0, tglobaladdr:$off, (CONST_I64 0), (i64 (out ty:$val)))>, + Requires<[IsNotPIC, HasAddr64]>; +} + +defm : NarrowingStorePatGlobalAddrOffOnly; +defm : NarrowingStorePatGlobalAddrOffOnly; // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types @@ -942,20 +1075,21 @@ def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>; // Quasi-Fused Multiply- Add and Subtract (QFMA/QFMS) //===----------------------------------------------------------------------===// -multiclass SIMDQFM baseInst> { +multiclass SIMDQFM simdopA, + bits<32> simdopS> { defm QFMA_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec_t V128:$dst), (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], - vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>; + vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", simdopA>; defm QFMS_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins), [(set (vec_t V128:$dst), (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))], - vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>; + vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", simdopS>; } -defm "" : SIMDQFM; -defm "" : SIMDQFM; +defm "" : SIMDQFM; +defm "" : SIMDQFM; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp new file mode 100644 index 0000000000000..cd84e68aed140 --- /dev/null +++ 
b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp @@ -0,0 +1,78 @@ +#include "WebAssemblySortRegion.h" +#include "WebAssemblyExceptionInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +using namespace llvm; +using namespace WebAssembly; + +namespace llvm { +namespace WebAssembly { +template <> +bool ConcreteSortRegion<MachineLoop>::isLoop() const { + return true; +} +} // end namespace WebAssembly +} // end namespace llvm + +const SortRegion *SortRegionInfo::getRegionFor(const MachineBasicBlock *MBB) { + const auto *ML = MLI.getLoopFor(MBB); + const auto *WE = WEI.getExceptionFor(MBB); + if (!ML && !WE) + return nullptr; + // We determine subregion relationship by domination of their headers, i.e., + // if region A's header dominates region B's header, B is a subregion of A. + // WebAssemblyException contains BBs in all its subregions (loops or + // exceptions), but MachineLoop may not, because MachineLoop does not + // contain BBs that don't have a path to its header even if they are + // dominated by its header. So here we should use + // WE->contains(ML->getHeader()), but not ML->contains(WE->getHeader()). + if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) { + // If the smallest region containing MBB is a loop + if (LoopMap.count(ML)) + return LoopMap[ML].get(); + LoopMap[ML] = std::make_unique<ConcreteSortRegion<MachineLoop>>(ML); + return LoopMap[ML].get(); + } else { + // If the smallest region containing MBB is an exception + if (ExceptionMap.count(WE)) + return ExceptionMap[WE].get(); + ExceptionMap[WE] = + std::make_unique<ConcreteSortRegion<WebAssemblyException>>(WE); + return ExceptionMap[WE].get(); + } +} + +MachineBasicBlock *SortRegionInfo::getBottom(const SortRegion *R) { + if (R->isLoop()) + return getBottom(MLI.getLoopFor(R->getHeader())); + else + return getBottom(WEI.getExceptionFor(R->getHeader())); +} + +MachineBasicBlock *SortRegionInfo::getBottom(const MachineLoop *ML) { + MachineBasicBlock *Bottom = ML->getHeader(); + for (MachineBasicBlock *MBB : ML->blocks()) { + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + // MachineLoop does not contain all BBs dominated by its header. BBs that + // don't have a path back to the loop header aren't included. But for the + // purpose of CFG sorting and stackification, we need a bottom BB among all + // BBs that are dominated by the loop header. So we check if there is any + // WebAssemblyException contained in this loop, and compute the bottommost + // BB of them all. + if (MBB->isEHPad()) { + MachineBasicBlock *ExBottom = getBottom(WEI.getExceptionFor(MBB)); + if (ExBottom->getNumber() > Bottom->getNumber()) + Bottom = ExBottom; + } + } + return Bottom; +} + +MachineBasicBlock *SortRegionInfo::getBottom(const WebAssemblyException *WE) { + MachineBasicBlock *Bottom = WE->getHeader(); + for (MachineBasicBlock *MBB : WE->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h new file mode 100644 index 0000000000000..e92bf17641854 --- /dev/null +++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.h @@ -0,0 +1,91 @@ +//===-- WebAssemblySortRegion.h - WebAssembly Sort SortRegion ----*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements regions used in CFGSort and CFGStackify. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSORTREGION_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSORTREGION_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator_range.h" + +namespace llvm { + +class MachineBasicBlock; +class MachineLoop; +class MachineLoopInfo; +class WebAssemblyException; +class WebAssemblyExceptionInfo; + +namespace WebAssembly { + +// Wrapper for loops and exceptions +class SortRegion { +public: + virtual ~SortRegion() = default; + virtual MachineBasicBlock *getHeader() const = 0; + virtual bool contains(const MachineBasicBlock *MBB) const = 0; + virtual unsigned getNumBlocks() const = 0; + using block_iterator = typename ArrayRef<MachineBasicBlock *>::const_iterator; + virtual iterator_range<block_iterator> blocks() const = 0; + virtual bool isLoop() const = 0; +}; + +template <class T> class ConcreteSortRegion : public SortRegion { + const T *Unit; + +public: + ConcreteSortRegion(const T *Unit) : Unit(Unit) {} + MachineBasicBlock *getHeader() const override { return Unit->getHeader(); } + bool contains(const MachineBasicBlock *MBB) const override { + return Unit->contains(MBB); + } + unsigned getNumBlocks() const override { return Unit->getNumBlocks(); } + iterator_range<block_iterator> blocks() const override { + return Unit->blocks(); + } + bool isLoop() const override { return false; } +}; + +// This class has information of nested SortRegions; this is analogous to what +// LoopInfo is for loops. +class SortRegionInfo { + friend class ConcreteSortRegion<MachineLoop>; + friend class ConcreteSortRegion<WebAssemblyException>; + + const MachineLoopInfo &MLI; + const WebAssemblyExceptionInfo &WEI; + DenseMap<const MachineLoop *, std::unique_ptr<ConcreteSortRegion<MachineLoop>>> LoopMap; + DenseMap<const WebAssemblyException *, std::unique_ptr<ConcreteSortRegion<WebAssemblyException>>> + ExceptionMap; + +public: + SortRegionInfo(const MachineLoopInfo &MLI, + const WebAssemblyExceptionInfo &WEI) + : MLI(MLI), WEI(WEI) {} + + // Returns the smallest loop or exception that contains MBB + const SortRegion *getRegionFor(const MachineBasicBlock *MBB); + + // Return the "bottom" block among all blocks dominated by the region + // (MachineLoop or WebAssemblyException) header. This works when the entity is + // discontiguous. + MachineBasicBlock *getBottom(const SortRegion *R); + MachineBasicBlock *getBottom(const MachineLoop *ML); + MachineBasicBlock *getBottom(const WebAssemblyException *WE); +}; + +} // end namespace WebAssembly + +} // end namespace llvm + +#endif diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h index 4f0ed43a24816..337077178557d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" namespace llvm { @@ -33,17 +34,6 @@ extern const char *const CxaRethrowFn; extern const char *const StdTerminateFn; extern const char *const PersonalityWrapperFn; -/// Return the "bottom" block of an entity, which can be either a MachineLoop or -/// WebAssemblyException. This differs from MachineLoop::getBottomBlock in that -/// it works even if the entity is discontiguous.
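Call sites migrate as in the following sketch (using only names introduced by this patch): instead of the deleted free-standing getBottom template, passes construct a SortRegionInfo from the two analyses and query it, which also lets getBottom fold the bottoms of exceptions nested inside a loop:

    // Usage sketch; MLI and WEI are the MachineLoopInfo and
    // WebAssemblyExceptionInfo analyses these passes already require.
    SortRegionInfo SRI(MLI, WEI);
    if (const SortRegion *R = SRI.getRegionFor(&MBB)) {
      MachineBasicBlock *Bottom = SRI.getBottom(R);
      (void)Bottom; // e.g. place the LOOP/TRY end marker after Bottom
    }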
-template MachineBasicBlock *getBottom(const T *Unit) { - MachineBasicBlock *Bottom = Unit->getHeader(); - for (MachineBasicBlock *MBB : Unit->blocks()) - if (MBB->getNumber() > Bottom->getNumber()) - Bottom = MBB; - return Bottom; -} - /// Returns the operand number of a callee, assuming the argument is a call /// instruction. const MachineOperand &getCalleeOp(const MachineInstr &MI); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index a3014b2aba92c..ddb13e46e9305 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -87,6 +87,14 @@ class X86AsmParser : public MCTargetAsmParser { VEXEncoding ForcedVEXEncoding = VEXEncoding_Default; + enum DispEncoding { + DispEncoding_Default, + DispEncoding_Disp8, + DispEncoding_Disp32, + }; + + DispEncoding ForcedDispEncoding = DispEncoding_Default; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -878,11 +886,6 @@ class X86AsmParser : public MCTargetAsmParser { return Parser.Error(L, Msg, Range); } - std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg, SMRange R = SMRange()) { - Error(Loc, Msg, R); - return nullptr; - } - bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc, SMLoc EndLoc); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, @@ -898,15 +901,15 @@ class X86AsmParser : public MCTargetAsmParser { std::unique_ptr &&Dst); bool VerifyAndAdjustOperands(OperandVector &OrigOperands, OperandVector &FinalOperands); - std::unique_ptr ParseOperand(); - std::unique_ptr ParseATTOperand(); - std::unique_ptr ParseIntelOperand(); + bool ParseOperand(OperandVector &Operands); + bool ParseATTOperand(OperandVector &Operands); + bool ParseIntelOperand(OperandVector &Operands); bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, InlineAsmIdentifierInfo &Info, SMLoc &End); bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); unsigned IdentifyIntelInlineAsmOperator(StringRef Name); unsigned ParseIntelInlineAsmOperator(unsigned OpKind); - std::unique_ptr ParseRoundingModeOp(SMLoc Start); + bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands); bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM, bool &ParseError, SMLoc &End); void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start, @@ -917,20 +920,21 @@ class X86AsmParser : public MCTargetAsmParser { bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator = false); - std::unique_ptr ParseMemOperand(unsigned SegReg, - const MCExpr *&Disp, - const SMLoc &StartLoc, - SMLoc &EndLoc); + bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, + SMLoc EndLoc, OperandVector &Operands); X86::CondCode ParseConditionCode(StringRef CCode); bool ParseIntelMemoryOperandSize(unsigned &Size); - std::unique_ptr - CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, - unsigned IndexReg, unsigned Scale, SMLoc Start, - SMLoc End, unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info); - + bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + const InlineAsmIdentifierInfo &Info, + OperandVector &Operands); + + bool parseDirectiveArch(); + bool parseDirectiveNops(SMLoc L); bool parseDirectiveEven(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); @@ -992,8 +996,7 @@ 
class X86AsmParser : public MCTargetAsmParser { /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. /// Return false if no parsing errors occurred, true otherwise. - bool HandleAVX512Operand(OperandVector &Operands, - const MCParsedAsmOperand &Op); + bool HandleAVX512Operand(OperandVector &Operands); bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc); @@ -1524,16 +1527,17 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; } -std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { +bool X86AsmParser::ParseOperand(OperandVector &Operands) { if (isParsingIntelSyntax()) - return ParseIntelOperand(); - return ParseATTOperand(); + return ParseIntelOperand(Operands); + + return ParseATTOperand(Operands); } -std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm( +bool X86AsmParser::CreateMemForMSInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info) { + const InlineAsmIdentifierInfo &Info, OperandVector &Operands) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) { @@ -1545,8 +1549,10 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm( } // Create an absolute memory reference in order to match against // instructions taking a PC relative operand. - return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, - Identifier, Info.Label.Decl); + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, + End, Size, Identifier, + Info.Label.Decl)); + return false; } // We either have a direct symbol reference, or an offset from a symbol. The // parser always puts the symbol on the LHS, so look there for size @@ -1563,17 +1569,19 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm( // It is widely common for MS InlineAsm to use a global variable and one/two // registers in a memory expression, even though it is inaccessible via rip/eip. if (IsGlobalLV && (BaseReg || IndexReg)) { - return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End); + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, Start, End)); + return false; + } // Otherwise, we set the base register to a non-zero value // if we don't know the actual value at this time. This is necessary to // get the matching correct in some cases. - } else { - BaseReg = BaseReg ? BaseReg : 1; - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, - IndexReg, Scale, Start, End, Size, - /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, - FrontendSize); - } + BaseReg = BaseReg ? BaseReg : 1; + Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End, + Size, + /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize)); + return false; } // Some binary bitwise operators have a named synonym @@ -1904,14 +1912,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier( } // ParseRoundingModeOp - Parse AVX-512 rounding mode operand -std::unique_ptr<X86Operand> -X86AsmParser::ParseRoundingModeOp(SMLoc Start) { +bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); // Eat "{" and mark the current place.
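For reference, the operand forms ParseRoundingModeOp accepts look as follows in AT&T syntax (illustrative examples, not taken from the patch):

    // {rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}: static rounding mode plus
    // suppress-all-exceptions, e.g.
    //   vaddps {rn-sae}, %zmm1, %zmm2, %zmm3
    // {sae}: suppress-all-exceptions only, e.g.
    //   vmaxps {sae}, %zmm1, %zmm2, %zmm3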
const SMLoc consumedToken = consumeToken(); if (Tok.isNot(AsmToken::Identifier)) - return ErrorOperand(Tok.getLoc(), "Expected an identifier after {"); + return Error(Tok.getLoc(), "Expected an identifier after {"); if (Tok.getIdentifier().startswith("r")){ int rndMode = StringSwitch(Tok.getIdentifier()) .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) @@ -1920,28 +1927,30 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) { .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) .Default(-1); if (-1 == rndMode) - return ErrorOperand(Tok.getLoc(), "Invalid rounding mode."); + return Error(Tok.getLoc(), "Invalid rounding mode."); Parser.Lex(); // Eat "r*" of r*-sae if (!getLexer().is(AsmToken::Minus)) - return ErrorOperand(Tok.getLoc(), "Expected - at this point"); + return Error(Tok.getLoc(), "Expected - at this point"); Parser.Lex(); // Eat "-" Parser.Lex(); // Eat the sae if (!getLexer().is(AsmToken::RCurly)) - return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + return Error(Tok.getLoc(), "Expected } at this point"); SMLoc End = Tok.getEndLoc(); Parser.Lex(); // Eat "}" const MCExpr *RndModeOp = MCConstantExpr::create(rndMode, Parser.getContext()); - return X86Operand::CreateImm(RndModeOp, Start, End); + Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End)); + return false; } if(Tok.getIdentifier().equals("sae")){ Parser.Lex(); // Eat the sae if (!getLexer().is(AsmToken::RCurly)) - return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + return Error(Tok.getLoc(), "Expected } at this point"); Parser.Lex(); // Eat "}" - return X86Operand::CreateToken("{sae}", consumedToken); + Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken)); + return false; } - return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + return Error(Tok.getLoc(), "unknown token in expression"); } /// Parse the '.' operator. @@ -2076,7 +2085,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } -std::unique_ptr X86AsmParser::ParseIntelOperand() { +bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -2084,28 +2093,31 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { // Parse optional Size directive. unsigned Size; if (ParseIntelMemoryOperandSize(Size)) - return nullptr; + return true; bool PtrInOperand = bool(Size); Start = Tok.getLoc(); // Rounding mode operand. if (getLexer().is(AsmToken::LCurly)) - return ParseRoundingModeOp(Start); + return ParseRoundingModeOp(Start, Operands); // Register operand. unsigned RegNo = 0; if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) { if (RegNo == X86::RIP) - return ErrorOperand(Start, "rip can only be used as a base register"); + return Error(Start, "rip can only be used as a base register"); // A Register followed by ':' is considered a segment override - if (Tok.isNot(AsmToken::Colon)) - return !PtrInOperand ? X86Operand::CreateReg(RegNo, Start, End) : - ErrorOperand(Start, "expected memory operand after 'ptr', " + if (Tok.isNot(AsmToken::Colon)) { + if (PtrInOperand) + return Error(Start, "expected memory operand after 'ptr', " "found register operand instead"); + Operands.push_back(X86Operand::CreateReg(RegNo, Start, End)); + return false; + } // An alleged segment override. 
check if we have a valid segment register if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) - return ErrorOperand(Start, "invalid segment register"); + return Error(Start, "invalid segment register"); // Eat ':' and update Start location Start = Lex().getLoc(); } @@ -2113,7 +2125,7 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { // Immediates and Memory IntelExprStateMachine SM; if (ParseIntelExpression(SM, End)) - return nullptr; + return true; if (isParsingMSInlineAsm()) RewriteIntelExpression(SM, Start, Tok.getLoc()); @@ -2134,12 +2146,15 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { // Disp includes the address of a variable; make sure this is recorded // for later handling. - return X86Operand::CreateImm(Disp, Start, End, SM.getSymName(), - Info.Var.Decl, Info.Var.IsGlobalLV); + Operands.push_back(X86Operand::CreateImm(Disp, Start, End, + SM.getSymName(), Info.Var.Decl, + Info.Var.IsGlobalLV)); + return false; } } - return X86Operand::CreateImm(Disp, Start, End); + Operands.push_back(X86Operand::CreateImm(Disp, Start, End)); + return false; } StringRef ErrMsg; @@ -2164,7 +2179,7 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { if (Scale != 0 && X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) - return ErrorOperand(Start, "16-bit addresses cannot have a scale"); + return Error(Start, "16-bit addresses cannot have a scale"); // If there was no explicit scale specified, change it to 1. if (Scale == 0) @@ -2180,26 +2195,33 @@ std::unique_ptr X86AsmParser::ParseIntelOperand() { if ((BaseReg || IndexReg) && CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(), ErrMsg)) - return ErrorOperand(Start, ErrMsg); + return Error(Start, ErrMsg); if (isParsingMSInlineAsm()) return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, Size, SM.getSymName(), - SM.getIdentifierInfo()); + SM.getIdentifierInfo(), Operands); // When parsing x64 MS-style assembly, all memory operands default to // RIP-relative when interpreted as non-absolute references. 
- if (Parser.isParsingMasm() && is64BitMode()) - return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, BaseReg, - IndexReg, Scale, Start, End, Size, - /*DefaultBaseReg=*/X86::RIP); - - if (!(BaseReg || IndexReg || RegNo)) - return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); - return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, End, Size); + if (Parser.isParsingMasm() && is64BitMode()) { + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, + BaseReg, IndexReg, Scale, Start, + End, Size, + /*DefaultBaseReg=*/X86::RIP)); + return false; + } + + if ((BaseReg || IndexReg || RegNo)) + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, + BaseReg, IndexReg, Scale, Start, + End, Size)); + else + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size)); + return false; } -std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { +bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { case AsmToken::Dollar: { @@ -2214,12 +2236,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { "expected immediate expression") || getParser().parseExpression(Val, End) || check(isa<X86MCExpr>(Val), L, "expected immediate expression")) - return nullptr; - return X86Operand::CreateImm(Val, Start, End); + return true; + Operands.push_back(X86Operand::CreateImm(Val, Start, End)); + return false; } case AsmToken::LCurly: { SMLoc Start = Parser.getTok().getLoc(); - return ParseRoundingModeOp(Start); + return ParseRoundingModeOp(Start, Operands); } default: { // This is a memory operand or a register. We have some parsing complications if (getLexer().isNot(AsmToken::LParen)) { // No '(' so this is either a displacement expression or a register. if (Parser.parseExpression(Expr, EndLoc)) - return nullptr; + return true; if (auto *RE = dyn_cast<X86MCExpr>(Expr)) { // Segment Register. Reset Expr and copy value to register. Expr = nullptr; @@ -2241,21 +2264,27 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { // Sanity check register. if (Reg == X86::EIZ || Reg == X86::RIZ) - return ErrorOperand( + return Error( Loc, "%eiz and %riz can only be used as index registers", SMRange(Loc, EndLoc)); if (Reg == X86::RIP) - return ErrorOperand(Loc, "%rip can only be used as a base register", - SMRange(Loc, EndLoc)); + return Error(Loc, "%rip can only be used as a base register", + SMRange(Loc, EndLoc)); // Return registers that are not segment prefixes immediately. - if (!Parser.parseOptionalToken(AsmToken::Colon)) - return X86Operand::CreateReg(Reg, Loc, EndLoc); + if (!Parser.parseOptionalToken(AsmToken::Colon)) { + Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc)); + return false; + } if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(Reg)) - return ErrorOperand(Loc, "invalid segment register"); + return Error(Loc, "invalid segment register"); + // Accept a '*' absolute memory reference after the segment. Place it + // before the full memory operand. + if (getLexer().is(AsmToken::Star)) + Operands.push_back(X86Operand::CreateToken("*", consumeToken())); } } // This is a Memory operand.
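The newly accepted '*' handling covers indirect operands where the star follows the segment prefix; sketched as a token sequence (illustrative only, the exact assembly forms depend on the matcher):

    // Token stream accepted by the new branch:
    //   %gs  :  *  <memory reference>
    // The "*" is emitted as its own token operand ahead of the memory
    // operand, mirroring how a leading "*" on indirect calls is handled.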
- return ParseMemOperand(Reg, Expr, Loc, EndLoc); + return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands); } } } @@ -2305,8 +2334,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr &Z, } // true on failure, false otherwise -bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, - const MCParsedAsmOperand &Op) { +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -2390,10 +2418,9 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, /// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix /// has already been parsed if present. disp may be provided as well. -std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, - const MCExpr *&Disp, - const SMLoc &StartLoc, - SMLoc &EndLoc) { +bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, + SMLoc StartLoc, SMLoc EndLoc, + OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc Loc; // Based on the initial passed values, we may be in any of these cases, we are @@ -2455,7 +2482,7 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, // Parse immediate if we're not at a mem operand yet. if (!isAtMemOperand()) { if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(Disp, EndLoc)) - return nullptr; + return true; assert(!isa(Disp) && "Expected non-register here."); } else { // Disp is implicitly zero if we haven't parsed it yet. @@ -2468,9 +2495,12 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, if (!parseOptionalToken(AsmToken::LParen)) { if (SegReg == 0) - return X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc); - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, - StartLoc, EndLoc); + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); + else + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + 0, 0, 1, StartLoc, EndLoc)); + return false; } // If we reached here, then eat the '(' and Process @@ -2484,14 +2514,13 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, if (getLexer().isNot(AsmToken::Comma) && getLexer().isNot(AsmToken::RParen)) { if (Parser.parseExpression(E, EndLoc) || check(!isa(E), BaseLoc, "expected register here")) - return nullptr; + return true; // Sanity check register. BaseReg = cast(E)->getRegNo(); if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) - return ErrorOperand(BaseLoc, - "eiz and riz can only be used as index registers", - SMRange(BaseLoc, EndLoc)); + return Error(BaseLoc, "eiz and riz can only be used as index registers", + SMRange(BaseLoc, EndLoc)); } if (parseOptionalToken(AsmToken::Comma)) { @@ -2503,14 +2532,14 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, // "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. if (getLexer().isNot(AsmToken::RParen)) { if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(E, EndLoc)) - return nullptr; + return true; if (!isa(E)) { // We've parsed an unexpected Scale Value instead of an index // register. Interpret it as an absolute. 
int64_t ScaleVal; if (!E->evaluateAsAbsolute(ScaleVal, getStreamer().getAssemblerPtr())) - return ErrorOperand(Loc, "expected absolute expression"); + return Error(Loc, "expected absolute expression"); if (ScaleVal != 1) Warning(Loc, "scale factor without index register is ignored"); Scale = 1; @@ -2518,10 +2547,10 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, IndexReg = cast(E)->getRegNo(); if (BaseReg == X86::RIP) - return ErrorOperand( - Loc, "%rip as base register can not have an index register"); + return Error(Loc, + "%rip as base register can not have an index register"); if (IndexReg == X86::RIP) - return ErrorOperand(Loc, "%rip is not allowed as an index register"); + return Error(Loc, "%rip is not allowed as an index register"); if (parseOptionalToken(AsmToken::Comma)) { // Parse the scale amount: @@ -2532,15 +2561,14 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, int64_t ScaleVal; if (Parser.parseTokenLoc(Loc) || Parser.parseAbsoluteExpression(ScaleVal)) - return ErrorOperand(Loc, "expected scale expression"); + return Error(Loc, "expected scale expression"); Scale = (unsigned)ScaleVal; // Validate the scale amount. if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && Scale != 1) - return ErrorOperand(Loc, - "scale factor in 16-bit address must be 1"); + return Error(Loc, "scale factor in 16-bit address must be 1"); if (checkScale(Scale, ErrMsg)) - return ErrorOperand(Loc, ErrMsg); + return Error(Loc, ErrMsg); } } } @@ -2549,23 +2577,30 @@ std::unique_ptr X86AsmParser::ParseMemOperand(unsigned SegReg, // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. if (parseToken(AsmToken::RParen, "unexpected token in memory operand")) - return nullptr; + return true; // This is to support otherwise illegal operand (%dx) found in various // unofficial manuals examples (e.g. "out[s]?[bwl]? %al, (%dx)") and must now // be supported. Mark such DX variants separately fix only in special cases. if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 && SegReg == 0 && - isa(Disp) && cast(Disp)->getValue() == 0) - return X86Operand::CreateDXReg(BaseLoc, BaseLoc); + isa(Disp) && + cast(Disp)->getValue() == 0) { + Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc)); + return false; + } if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(), ErrMsg)) - return ErrorOperand(BaseLoc, ErrMsg); + return Error(BaseLoc, ErrMsg); if (SegReg || BaseReg || IndexReg) - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, - IndexReg, Scale, StartLoc, EndLoc); - return X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc); + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + BaseReg, IndexReg, Scale, StartLoc, + EndLoc)); + else + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); + return false; } // Parse either a standard primary expression or a register. @@ -2592,6 +2627,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Reset the forced VEX encoding. ForcedVEXEncoding = VEXEncoding_Default; + ForcedDispEncoding = DispEncoding_Default; // Parse pseudo prefixes. 
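// [Editor's note] The pseudo-prefix loop that follows now understands two
// new displacement hints written in braces before the mnemonic, e.g. in
// AT&T syntax:
//   {disp32} jmp target        # force the relaxed branch encoding up front
//                              # (16-bit mode still emits a 16-bit disp)
//   {disp8} movl %eax, (%rax)  # emit an explicit 0 disp8 where no
//                              # displacement byte would normally be used
// A hedged sketch of the classification the loop performs; the enumerators
// match the diff, the free-standing helper is hypothetical:

#include <string>

enum X86DispEncoding {
  DispEncoding_Default,
  DispEncoding_Disp8,
  DispEncoding_Disp32,
};

// Map the identifier inside {...} to a forced displacement encoding;
// unrelated prefixes ("vex", "vex2", "vex3", "evex") take other branches.
inline X86DispEncoding classifyDispPrefix(const std::string &Prefix) {
  if (Prefix == "disp8")
    return DispEncoding_Disp8;
  if (Prefix == "disp32")
    return DispEncoding_Disp32;
  return DispEncoding_Default;
}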
while (1) { @@ -2610,6 +2646,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, ForcedVEXEncoding = VEXEncoding_VEX3; else if (Prefix == "evex") ForcedVEXEncoding = VEXEncoding_EVEX; + else if (Prefix == "disp8") + ForcedDispEncoding = DispEncoding_Disp8; + else if (Prefix == "disp32") + ForcedDispEncoding = DispEncoding_Disp32; else return Error(NameLoc, "unknown prefix"); @@ -2878,13 +2918,11 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Read the operands. while(1) { - if (std::unique_ptr Op = ParseOperand()) { - Operands.push_back(std::move(Op)); - if (HandleAVX512Operand(Operands, *Operands.back())) - return true; - } else { - return true; - } + if (ParseOperand(Operands)) + return true; + if (HandleAVX512Operand(Operands)) + return true; + // check for comma and eat it if (getLexer().is(AsmToken::Comma)) Parser.Lex(); @@ -3118,6 +3156,26 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; + case X86::JMP_1: + // {disp32} forces a larger displacement as if the instruction was relaxed. + // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. + // This matches GNU assembler. + if (ForcedDispEncoding == DispEncoding_Disp32) { + Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4); + return true; + } + + return false; + case X86::JCC_1: + // {disp32} forces a larger displacement as if the instruction was relaxed. + // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. + // This matches GNU assembler. + if (ForcedDispEncoding == DispEncoding_Disp32) { + Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4); + return true; + } + + return false; case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: @@ -3515,6 +3573,12 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, if (ForcedVEXEncoding == VEXEncoding_VEX3) Prefixes |= X86::IP_USE_VEX3; + // Set encoded flags for {disp8} and {disp32}. + if (ForcedDispEncoding == DispEncoding_Disp8) + Prefixes |= X86::IP_USE_DISP8; + else if (ForcedDispEncoding == DispEncoding_Disp32) + Prefixes |= X86::IP_USE_DISP32; + if (Prefixes) Inst.setFlags(Prefixes); @@ -3749,6 +3813,12 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, if (ForcedVEXEncoding == VEXEncoding_VEX3) Prefixes |= X86::IP_USE_VEX3; + // Set encoded flags for {disp8} and {disp32}. 
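// [Editor's note] A sketch of how the forced encoding chosen above reaches
// the encoder: it is folded into per-instruction flag bits (IP_USE_DISP8 =
// 128, IP_USE_DISP32 = 256; see the X86BaseInfo.h hunk below), which
// emitMemModRMByte later decodes into "which displacement sizes are legal".
// Names are stand-ins for the real members:

enum : unsigned { IP_USE_DISP8 = 128, IP_USE_DISP32 = 256 };

// Producer side (MatchAndEmit*Instruction): fold the pseudo prefix in.
inline unsigned addDispPrefixFlags(unsigned Prefixes, bool ForceDisp8,
                                   bool ForceDisp32) {
  if (ForceDisp8)
    Prefixes |= IP_USE_DISP8;
  else if (ForceDisp32)
    Prefixes |= IP_USE_DISP32;
  return Prefixes;
}

// Consumer side (emitMemModRMByte): derive the permissions used there.
inline void decodeDispPrefixFlags(unsigned Flags, bool &AllowNoDisp,
                                  bool &AllowDisp8) {
  bool UseDisp8 = (Flags & IP_USE_DISP8) != 0;
  bool UseDisp32 = (Flags & IP_USE_DISP32) != 0;
  AllowNoDisp = !UseDisp8 && !UseDisp32; // elide a disp only with no prefix
  AllowDisp8 = !UseDisp32;               // disp8 unless {disp32} was given
}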
+ if (ForcedDispEncoding == DispEncoding_Disp8) + Prefixes |= X86::IP_USE_DISP8; + else if (ForcedDispEncoding == DispEncoding_Disp32) + Prefixes |= X86::IP_USE_DISP32; + if (Prefixes) Inst.setFlags(Prefixes); @@ -3942,6 +4012,8 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal.startswith(".arch")) + return parseDirectiveArch(); if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { @@ -3966,7 +4038,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } else if (IDVal == ".even") + } else if (IDVal == ".nops") + return parseDirectiveNops(DirectiveID.getLoc()); + else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") return parseDirectiveFPOProc(DirectiveID.getLoc()); @@ -3996,6 +4070,48 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return true; } +bool X86AsmParser::parseDirectiveArch() { + // Ignore .arch for now. + getParser().parseStringToEndOfStatement(); + return false; +} + +/// parseDirectiveNops +/// ::= .nops size[, control] +bool X86AsmParser::parseDirectiveNops(SMLoc L) { + int64_t NumBytes = 0, Control = 0; + SMLoc NumBytesLoc, ControlLoc; + const MCSubtargetInfo STI = getSTI(); + NumBytesLoc = getTok().getLoc(); + if (getParser().checkForValidSection() || + getParser().parseAbsoluteExpression(NumBytes)) + return true; + + if (parseOptionalToken(AsmToken::Comma)) { + ControlLoc = getTok().getLoc(); + if (getParser().parseAbsoluteExpression(Control)) + return true; + } + if (getParser().parseToken(AsmToken::EndOfStatement, + "unexpected token in '.nops' directive")) + return true; + + if (NumBytes <= 0) { + Error(NumBytesLoc, "'.nops' directive with non-positive size"); + return false; + } + + if (Control < 0) { + Error(ControlLoc, "'.nops' directive with negative NOP size"); + return false; + } + + /// Emit nops + getParser().getStreamer().emitNops(NumBytes, Control, L); + + return false; +} + /// parseDirectiveEven /// ::= .even bool X86AsmParser::parseDirectiveEven(SMLoc L) { diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 2b0cbb676c129..677d0a55336a1 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -28,7 +28,6 @@ set(sources X86CallingConv.cpp X86CallLowering.cpp X86CmovConversion.cpp - X86CondBrFolding.cpp X86DomainReassignment.cpp X86DiscriminateMemOps.cpp X86ExpandPseudo.cpp diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index bf3b6bcb5463f..31bc54f53d027 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -207,6 +207,8 @@ class X86AsmBackend : public MCAsmBackend { void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override; + unsigned getMaximumNopSize() const override; + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; } // end anonymous namespace @@ -1067,6 +1069,21 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, } } +unsigned X86AsmBackend::getMaximumNopSize() const { + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) + return 1; + if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) + return 7; + 
if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + return 15; + if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) + return 11; + // FIXME: handle 32-bit mode + // 15-bytes is the longest single NOP instruction, but 10-bytes is + // commonly the longest that can be efficiently decoded. + return 10; +} + /// Write a sequence of optimal nops to the output, covering \p Count /// bytes. /// \return - true on success, false on failure @@ -1094,23 +1111,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", }; - // This CPU doesn't support long nops. If needed add more. - // FIXME: We could generated something better than plain 0x90. - if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) { - for (uint64_t i = 0; i < Count; ++i) - OS << '\x90'; - return true; - } - - // 15-bytes is the longest single NOP instruction, but 10-bytes is - // commonly the longest that can be efficiently decoded. - uint64_t MaxNopLength = 10; - if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) - MaxNopLength = 7; - else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) - MaxNopLength = 15; - else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) - MaxNopLength = 11; + uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(); // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining // length. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 79f07d3c7792a..b6e8d4813d4ca 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -62,6 +62,8 @@ namespace X86 { IP_HAS_LOCK = 16, IP_HAS_NOTRACK = 32, IP_USE_VEX3 = 64, + IP_USE_DISP8 = 128, + IP_USE_DISP32 = 256, }; enum OperandType : unsigned { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 7dea0760a8310..0de94cda2d739 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -113,33 +113,28 @@ static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) { } } -/// \returns true if this signed displacement fits in a 8-bit sign-extended -/// field. -static bool isDisp8(int Value) { return Value == (int8_t)Value; } - -/// \returns true if this signed displacement fits in a 8-bit compressed -/// dispacement field. -static bool isCDisp8(uint64_t TSFlags, int Value, int &CValue) { - assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && - "Compressed 8-bit displacement is only valid for EVEX inst."); +/// Determine if this immediate can fit in a disp8 or a compressed disp8 for +/// EVEX instructions. \p ImmOffset will be set to the value to pass to the +/// ImmOffset parameter of emitImmediate.
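// [Editor's note] A self-contained sketch of the disp8 / compressed-disp8
// rule that isDispOrCDisp8 below implements, under the assumption that
// CD8_Scale is the EVEX memory-object size, a power of two, and 0 when no
// compression applies. Worked example: with CD8_Scale == 64, a displacement
// of 256 compresses to 256 / 64 == 4, and ImmOffset == 4 - 256 == -252, so
// emitImmediate(Disp, ..., ImmOffset) writes the single byte 4.

#include <cassert>
#include <cstdint>

inline bool fitsInInt8(int64_t V) { return V >= -128 && V <= 127; }

// Mirrors isDispOrCDisp8: true if Value can be emitted as one byte, setting
// ImmOffset to the delta that emitImmediate must add to recover that byte.
inline bool sketchIsDispOrCDisp8(bool HasEVEX, int CD8_Scale, int Value,
                                 int &ImmOffset) {
  if (!HasEVEX || CD8_Scale == 0)
    return fitsInInt8(Value); // plain disp8
  assert((CD8_Scale & (CD8_Scale - 1)) == 0 && "scale must be a power of 2");
  if (Value & (CD8_Scale - 1))
    return false; // unaligned offsets cannot be compressed
  int CDisp8 = Value / CD8_Scale;
  if (!fitsInInt8(CDisp8))
    return false;
  ImmOffset = CDisp8 - Value; // added back in emitImmediate, leaving CDisp8
  return true;
}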
+static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) { + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; - unsigned CD8_Scale = + int CD8_Scale = (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; - if (CD8_Scale == 0) { - CValue = Value; - return isDisp8(Value); - } + if (!HasEVEX || CD8_Scale == 0) + return isInt<8>(Value); + + assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!"); + if (Value & (CD8_Scale - 1)) // Unaligned offset + return false; - unsigned Mask = CD8_Scale - 1; - assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); - if (Value & Mask) // Unaligned offset + int CDisp8 = Value / CD8_Scale; + if (!isInt<8>(CDisp8)) return false; - Value /= (int)CD8_Scale; - bool Ret = (Value == (int8_t)Value); - if (Ret) - CValue = Value; - return Ret; + // ImmOffset will be added to Value in emitImmediate leaving just CDisp8. + ImmOffset = CDisp8 - Value; + return true; } /// \returns the appropriate fixup kind to use for an immediate in an @@ -164,17 +159,20 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { /// \returns true if the specified instruction has a 16-bit memory operand. static bool is16BitMemOperand(const MCInst &MI, unsigned Op, const MCSubtargetInfo &STI) { - const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); - if (STI.hasFeature(X86::Mode16Bit) && BaseReg.getReg() == 0 && Disp.isImm() && - Disp.getImm() < 0x10000) + unsigned BaseReg = Base.getReg(); + unsigned IndexReg = Index.getReg(); + + if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0 && + Disp.isImm() && Disp.getImm() < 0x10000) return true; - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + if ((BaseReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || + (IndexReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) return true; return false; } @@ -390,7 +388,6 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); - bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; // Handle %rip relative addressing. if (BaseReg == X86::RIP || @@ -484,7 +481,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1); } - if (Disp.isImm() && isDisp8(Disp.getImm())) { + if (Disp.isImm() && isInt<8>(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. emitByte(modRMByte(0, RegOpcodeField, RMfield), OS); @@ -498,6 +495,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is the [REG]+disp16 case. emitByte(modRMByte(2, RegOpcodeField, RMfield), OS); } else { + assert(IndexReg.getReg() == 0 && "Unexpected index register!"); // There is no BaseReg; this is the plain [disp16] case. 
emitByte(modRMByte(0, RegOpcodeField, 6), OS); } @@ -507,12 +505,18 @@ return; } - // Determine whether a SIB byte is needed. - // If no BaseReg, issue a RIP relative instruction only if the MCE can - // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table - // 2-7) and absolute references. + // Check for presence of {disp8} or {disp32} pseudo prefixes. + bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8; + bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32; - if ( // The SIB byte must be used if there is an index register. + // We only allow no displacement if no pseudo prefix is present. + bool AllowNoDisp = !UseDisp8 && !UseDisp32; + // Disp8 is allowed unless the {disp32} prefix is present. + bool AllowDisp8 = !UseDisp32; + + // Determine whether a SIB byte is needed. + if (// The SIB byte must be used if there is an index register or the + // encoding requires a SIB byte. !ForceSIB && IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is @@ -528,12 +532,12 @@ return; } - // If the base is not EBP/ESP and there is no displacement, use simple - // indirect register encoding, this handles addresses like [EAX]. The - // encoding for [EBP] with no displacement means [disp32] so we handle it - // by emitting a displacement of 0 below. + // If the base is not EBP/ESP/R12/R13 and there is no displacement, use + // simple indirect register encoding, this handles addresses like [EAX]. + // The encoding for [EBP] or [R13] with no displacement means [disp32] so we + // handle it by emitting a displacement of 0 later. if (BaseRegNo != N86::EBP) { - if (Disp.isImm() && Disp.getImm() == 0) { + if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) { emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } @@ -552,24 +556,22 @@ } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - if (Disp.isImm()) { - if (!HasEVEX && isDisp8(Disp.getImm())) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); - return; - } - // Try EVEX compressed 8-bit displacement first; if failed, fall back to - // 32-bit displacement. - int CDisp8 = 0; - if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Including a compressed disp8 for EVEX instructions that support it. + // This also handles the 0 displacement for [EBP] or [R13]. We can't use + // disp8 if the {disp32} pseudo prefix is present. + if (Disp.isImm() && AllowDisp8) { + int ImmOffset = 0; + if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, - CDisp8 - Disp.getImm()); + ImmOffset); return; } } - // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + // Otherwise, emit the most general non-SIB encoding: [REG+disp32]. + // Displacement may be 0 for the [EBP] or [R13] case if the {disp32} pseudo + // prefix prevented using disp8 above. emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS); unsigned Opcode = MI.getOpcode(); unsigned FixupKind = Opcode == X86::MOV32rm ?
X86::reloc_signed_4byte_relax @@ -585,64 +587,47 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, bool ForceDisp32 = false; bool ForceDisp8 = false; - int CDisp8 = 0; int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. + BaseRegNo = 5; emitByte(modRMByte(0, RegOpcodeField, 4), OS); ForceDisp32 = true; - } else if (!Disp.isImm()) { - // Emit the normal disp32 encoding. - emitByte(modRMByte(2, RegOpcodeField, 4), OS); - ForceDisp32 = true; - } else if (Disp.getImm() == 0 && - // Base reg can't be anything that ends up with '5' as the base - // reg, it is the magic [*] nomenclature that indicates no base. + } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp && + // Base reg can't be EBP/RBP/R13 as that would end up with '5' as + // the base field, but that is the magic [*] nomenclature that + // indicates no base when mod=0. For these cases we'll emit a 0 + // displacement instead. BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte emitByte(modRMByte(0, RegOpcodeField, 4), OS); - } else if (!HasEVEX && isDisp8(Disp.getImm())) { - // Emit the disp8 encoding. + } else if (Disp.isImm() && AllowDisp8 && + isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { + // Displacement fits in a byte or matches an EVEX compressed disp8, use + // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless + // {disp32} pseudo prefix was used. emitByte(modRMByte(1, RegOpcodeField, 4), OS); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { - // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), OS); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - ImmOffset = CDisp8 - Disp.getImm(); + ForceDisp8 = true; } else { - // Emit the normal disp32 encoding. + // Otherwise, emit the normal disp32 encoding. emitByte(modRMByte(2, RegOpcodeField, 4), OS); + ForceDisp32 = true; } // Calculate what the SS field value should be... static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3}; unsigned SS = SSTable[Scale.getImm()]; - if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel - // Manual 2A, table 2-7. The displacement has already been output. - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg); - else // Examples: [ESP+1*+4] or [scaled idx]+disp32 (MOD=0,BASE=5) - IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5, OS); - } else { - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg); - else - IndexRegNo = 4; // For example [ESP+1*+4] - emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS); - } + unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4; + + emitSIBByte(SS, IndexRegNo, BaseRegNo, OS); // Do we need to output a displacement? 
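// [Editor's note] Condensing the SIB-path displacement choice above into a
// table (a sketch with stand-in parameters, not the emitter's actual API):
//   no base register                          -> mod=0, base=5, disp32
//   disp == 0, no pseudo prefix, base not
//     EBP/RBP/R13                             -> mod=0, no displacement
//   disp8/compressed disp8 fits, no {disp32}  -> mod=1, disp8
//   otherwise                                 -> mod=2, disp32

// Returns the number of displacement bytes the chosen form emits (0, 1, 4).
inline unsigned sibDispBytes(bool HasBase, bool DispIsZero, bool AllowNoDisp,
                             bool AllowDisp8, bool FitsDisp8,
                             bool BaseIsEBPLike) {
  if (!HasBase)
    return 4; // [scaled index] + disp32, mod=0 with base field 5
  if (DispIsZero && AllowNoDisp && !BaseIsEBPLike)
    return 0; // plain [base + scaled index], no displacement byte
  if (AllowDisp8 && FitsDisp8)
    return 1; // disp8 (or EVEX compressed disp8)
  return 4;   // general case: disp32
}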
if (ForceDisp8) emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, ImmOffset); - else if (ForceDisp32 || Disp.getImm() != 0) + else if (ForceDisp32) emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), StartByte, OS, Fixups); } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 8a478354cb168..8679bafa088e1 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -44,8 +44,10 @@ using namespace llvm; std::string X86_MC::ParseX86Triple(const Triple &TT) { std::string FS; - if (TT.getArch() == Triple::x86_64) - FS = "+64bit-mode,-32bit-mode,-16bit-mode"; + // SSE2 should default to enabled in 64-bit mode, but can be turned off + // explicitly. + if (TT.isArch64Bit()) + FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2"; else if (TT.getEnvironment() != Triple::CODE16) FS = "-64bit-mode,+32bit-mode,-16bit-mode"; else diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 91ba4e3d091e8..d4ad10d79bab2 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -67,9 +67,6 @@ FunctionPass *createX86OptimizeLEAs(); /// Return a pass that transforms setcc + movzx pairs into xor + setcc. FunctionPass *createX86FixupSetCC(); -/// Return a pass that folds conditional branch jumps. -FunctionPass *createX86CondBrFolding(); - /// Return a pass that avoids creating store forward block issues in the hardware. FunctionPass *createX86AvoidStoreForwardingBlocks(); @@ -154,7 +151,6 @@ void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86AvoidTrailingCallPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 69d83ab1528cb..332033b8fe257 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -499,12 +499,6 @@ def FeatureUseGLMDivSqrtCosts : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true", "Use Goldmont specific floating point div/sqrt costs">; -// Merge branches using three-way conditional code. -def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", - "ThreewayBranchProfitable", "true", - "Merge branches to a three-way " - "conditional branch">; - // Enable use of alias analysis during code generation. 
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", "Use alias analysis during codegen">; @@ -553,58 +547,49 @@ include "X86SchedSkylakeServer.td" def ProcessorFeatures { // Nehalem - list NHMInheritableFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureMMX, - FeatureSSE42, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePOPCNT, - FeatureLAHFSAHF, - FeatureMacroFusion, - FeatureInsertVZEROUPPER]; - list NHMSpecificFeatures = []; - list NHMFeatures = - !listconcat(NHMInheritableFeatures, NHMSpecificFeatures); + list NHMFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureLAHFSAHF]; + list NHMTuning = [FeatureMacroFusion, + FeatureInsertVZEROUPPER]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; - list WSMSpecificFeatures = []; - list WSMInheritableFeatures = - !listconcat(NHMInheritableFeatures, WSMAdditionalFeatures); + list WSMTuning = NHMTuning; list WSMFeatures = - !listconcat(WSMInheritableFeatures, WSMSpecificFeatures); + !listconcat(NHMFeatures, WSMAdditionalFeatures); // Sandybridge list SNBAdditionalFeatures = [FeatureAVX, - FeatureSlowDivide64, FeatureXSAVE, - FeatureXSAVEOPT, - FeatureSlow3OpsLEA, - FeatureFastScalarFSQRT, - FeatureFastSHLDRotate, - FeatureMergeToThreeWayBranch, - FeatureFast15ByteNOP]; - list SNBSpecificFeatures = [FeatureSlowUAMem32, - FeaturePOPCNTFalseDeps]; - list SNBInheritableFeatures = - !listconcat(WSMInheritableFeatures, SNBAdditionalFeatures); + FeatureXSAVEOPT]; + list SNBTuning = [FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureSlowUAMem32, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list SNBFeatures = - !listconcat(SNBInheritableFeatures, SNBSpecificFeatures); + !listconcat(WSMFeatures, SNBAdditionalFeatures); // Ivybridge list IVBAdditionalFeatures = [FeatureRDRAND, FeatureF16C, FeatureFSGSBase]; - list IVBSpecificFeatures = [FeatureSlowUAMem32, - FeaturePOPCNTFalseDeps]; - list IVBInheritableFeatures = - !listconcat(SNBInheritableFeatures, IVBAdditionalFeatures); + list IVBTuning = SNBTuning; list IVBFeatures = - !listconcat(IVBInheritableFeatures, IVBSpecificFeatures); + !listconcat(SNBFeatures, IVBAdditionalFeatures); // Haswell list HSWAdditionalFeatures = [FeatureAVX2, @@ -614,77 +599,89 @@ def ProcessorFeatures { FeatureFMA, FeatureINVPCID, FeatureLZCNT, - FeatureMOVBE, - FeatureFastVariableShuffle]; - list HSWSpecificFeatures = [FeaturePOPCNTFalseDeps, - FeatureLZCNTFalseDeps]; - list HSWInheritableFeatures = - !listconcat(IVBInheritableFeatures, HSWAdditionalFeatures); + FeatureMOVBE]; + list HSWTuning = [FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps, + FeatureInsertVZEROUPPER]; list HSWFeatures = - !listconcat(HSWInheritableFeatures, HSWSpecificFeatures); + !listconcat(IVBFeatures, HSWAdditionalFeatures); // Broadwell list BDWAdditionalFeatures = [FeatureADX, FeatureRDSEED, FeaturePRFCHW]; - list BDWSpecificFeatures = [FeaturePOPCNTFalseDeps, - FeatureLZCNTFalseDeps]; - list BDWInheritableFeatures = - !listconcat(HSWInheritableFeatures, BDWAdditionalFeatures); + list BDWTuning = HSWTuning; list BDWFeatures = - !listconcat(BDWInheritableFeatures, BDWSpecificFeatures); 
+ !listconcat(HSWFeatures, BDWAdditionalFeatures); // Skylake list SKLAdditionalFeatures = [FeatureAES, FeatureXSAVEC, FeatureXSAVES, FeatureCLFLUSHOPT, - FeatureFastVectorFSQRT]; - list SKLSpecificFeatures = [FeatureHasFastGather, - FeaturePOPCNTFalseDeps, - FeatureSGX]; - list SKLInheritableFeatures = - !listconcat(BDWInheritableFeatures, SKLAdditionalFeatures); + FeatureSGX]; + list SKLTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list SKLFeatures = - !listconcat(SKLInheritableFeatures, SKLSpecificFeatures); + !listconcat(BDWFeatures, SKLAdditionalFeatures); // Skylake-AVX512 - list SKXAdditionalFeatures = [FeatureAVX512, - FeaturePrefer256Bit, + list SKXAdditionalFeatures = [FeatureAES, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureAVX512, FeatureCDI, FeatureDQI, FeatureBWI, FeatureVLX, FeaturePKU, FeatureCLWB]; - list SKXSpecificFeatures = [FeatureHasFastGather, - FeaturePOPCNTFalseDeps]; - list SKXInheritableFeatures = - !listconcat(SKLInheritableFeatures, SKXAdditionalFeatures); + list SKXTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePrefer256Bit, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list SKXFeatures = - !listconcat(SKXInheritableFeatures, SKXSpecificFeatures); + !listconcat(BDWFeatures, SKXAdditionalFeatures); // Cascadelake list CLXAdditionalFeatures = [FeatureVNNI]; - list CLXSpecificFeatures = [FeatureHasFastGather, - FeaturePOPCNTFalseDeps]; - list CLXInheritableFeatures = - !listconcat(SKXInheritableFeatures, CLXAdditionalFeatures); + list CLXTuning = SKXTuning; list CLXFeatures = - !listconcat(CLXInheritableFeatures, CLXSpecificFeatures); + !listconcat(SKXFeatures, CLXAdditionalFeatures); // Cooperlake list CPXAdditionalFeatures = [FeatureBF16]; - list CPXSpecificFeatures = [FeatureHasFastGather, - FeaturePOPCNTFalseDeps]; - list CPXInheritableFeatures = - !listconcat(CLXInheritableFeatures, CPXAdditionalFeatures); + list CPXTuning = SKXTuning; list CPXFeatures = - !listconcat(CPXInheritableFeatures, CPXSpecificFeatures); + !listconcat(CLXFeatures, CPXAdditionalFeatures); // Cannonlake list CNLAdditionalFeatures = [FeatureAVX512, - FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -692,13 +689,20 @@ def ProcessorFeatures { FeaturePKU, FeatureVBMI, FeatureIFMA, - FeatureSHA, - FeatureSGX]; - list CNLSpecificFeatures = [FeatureHasFastGather]; - list CNLInheritableFeatures = - !listconcat(SKLInheritableFeatures, CNLAdditionalFeatures); + FeatureSHA]; + list CNLTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePrefer256Bit, + FeatureInsertVZEROUPPER]; list CNLFeatures = - !listconcat(CNLInheritableFeatures, CNLSpecificFeatures); + !listconcat(SKLFeatures, CNLAdditionalFeatures); // Icelake list ICLAdditionalFeatures = [FeatureBITALG, @@ -710,71 +714,65 @@ def ProcessorFeatures { FeatureGFNI, FeatureCLWB, FeatureRDPID]; - list ICLSpecificFeatures = [FeatureHasFastGather]; - list ICLInheritableFeatures = - 
!listconcat(CNLInheritableFeatures, ICLAdditionalFeatures); + list ICLTuning = CNLTuning; list ICLFeatures = - !listconcat(ICLInheritableFeatures, ICLSpecificFeatures); + !listconcat(CNLFeatures, ICLAdditionalFeatures); // Icelake Server - list ICXSpecificFeatures = [FeaturePCONFIG, - FeatureWBNOINVD, - FeatureHasFastGather]; + list ICXAdditionalFeatures = [FeaturePCONFIG, + FeatureWBNOINVD]; + list ICXTuning = CNLTuning; list ICXFeatures = - !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + !listconcat(ICLFeatures, ICXAdditionalFeatures); //Tigerlake list TGLAdditionalFeatures = [FeatureVP2INTERSECT, FeatureMOVDIRI, FeatureMOVDIR64B, FeatureSHSTK]; - list TGLSpecificFeatures = [FeatureHasFastGather]; - list TGLInheritableFeatures = - !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures); + list TGLTuning = CNLTuning; list TGLFeatures = - !listconcat(ICLFeatures, TGLInheritableFeatures ); + !listconcat(ICLFeatures, TGLAdditionalFeatures ); // Atom - list AtomInheritableFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureSlowTwoMemOps, - FeatureLAHFSAHF, - FeatureInsertVZEROUPPER]; - list AtomSpecificFeatures = [ProcIntelAtom, - FeatureSlowUAMem16, - FeatureLEAForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureLEAUsesAG, - FeaturePadShortFunctions]; - list AtomFeatures = - !listconcat(AtomInheritableFeatures, AtomSpecificFeatures); + list AtomFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureLAHFSAHF]; + list AtomTuning = [ProcIntelAtom, + FeatureSlowUAMem16, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureSlowTwoMemOps, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureInsertVZEROUPPER]; // Silvermont list SLMAdditionalFeatures = [FeatureSSE42, FeaturePOPCNT, FeaturePCLMUL, FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, FeatureRDRAND]; - list SLMSpecificFeatures = [ProcIntelSLM, - FeatureSlowDivide64, - FeatureSlowPMULLD, - FeatureFast7ByteNOP, - FeaturePOPCNTFalseDeps]; - list SLMInheritableFeatures = - !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures); + list SLMTuning = [ProcIntelSLM, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowDivide64, + FeatureSlowPMULLD, + FeatureFast7ByteNOP, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list SLMFeatures = - !listconcat(SLMInheritableFeatures, SLMSpecificFeatures); + !listconcat(AtomFeatures, SLMAdditionalFeatures); // Goldmont list GLMAdditionalFeatures = [FeatureAES, @@ -786,31 +784,33 @@ def ProcessorFeatures { FeatureXSAVES, FeatureCLFLUSHOPT, FeatureFSGSBase]; - list GLMSpecificFeatures = [FeatureUseGLMDivSqrtCosts, - FeaturePOPCNTFalseDeps]; - list GLMInheritableFeatures = - !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures); + list GLMTuning = [FeatureUseGLMDivSqrtCosts, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list GLMFeatures = - !listconcat(GLMInheritableFeatures, GLMSpecificFeatures); + !listconcat(SLMFeatures, GLMAdditionalFeatures); // Goldmont Plus list GLPAdditionalFeatures = [FeaturePTWRITE, FeatureRDPID, FeatureSGX]; - list GLPSpecificFeatures = [FeatureUseGLMDivSqrtCosts]; - list GLPInheritableFeatures = - !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures); + list GLPTuning 
= [FeatureUseGLMDivSqrtCosts, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureInsertVZEROUPPER]; list GLPFeatures = - !listconcat(GLPInheritableFeatures, GLPSpecificFeatures); + !listconcat(GLMFeatures, GLPAdditionalFeatures); // Tremont list TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts]; - list TRMInheritableFeatures = - !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures); + list TRMTuning = GLPTuning; list TRMFeatures = - !listconcat(TRMInheritableFeatures, TRMSpecificFeatures); + !listconcat(GLPFeatures, TRMAdditionalFeatures); // Knights Landing list KNLFeatures = [FeatureX87, @@ -822,13 +822,10 @@ def ProcessorFeatures { Feature64Bit, FeatureCMPXCHG16B, FeaturePOPCNT, - FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, - FeatureSlow3OpsLEA, - FeatureSlowIncDec, FeatureAES, FeatureRDRAND, FeatureF16C, @@ -845,56 +842,56 @@ def ProcessorFeatures { FeatureBMI, FeatureBMI2, FeatureFMA, - FeaturePRFCHW, - FeaturePreferMaskRegisters, - FeatureSlowTwoMemOps, - FeatureHasFastGather, - FeatureSlowPMADDWD]; + FeaturePRFCHW]; + list KNLTuning = [FeatureSlowDivide64, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureSlowTwoMemOps, + FeaturePreferMaskRegisters, + FeatureHasFastGather, + FeatureSlowPMADDWD]; // TODO Add AVX5124FMAPS/AVX5124VNNIW features list KNMFeatures = !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); // Barcelona - list BarcelonaInheritableFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureSSE4A, - Feature3DNowA, - FeatureFXSR, - FeatureNOPL, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureCMOV, - Feature64Bit, - FeatureFastScalarShiftMasks, - FeatureInsertVZEROUPPER]; - list BarcelonaFeatures = BarcelonaInheritableFeatures; + list BarcelonaFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureSSE4A, + Feature3DNowA, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureLAHFSAHF, + FeatureCMOV, + Feature64Bit]; + list BarcelonaTuning = [FeatureFastScalarShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; // Bobcat - list BtVer1InheritableFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureSSE4A, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast15ByteNOP, - FeatureFastScalarShiftMasks, - FeatureFastVectorShiftMasks]; - list BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER]; - list BtVer1Features = - !listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures); + list BtVer1Features = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureLAHFSAHF]; + list BtVer1Tuning = [FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; // Jaguar list BtVer2AdditionalFeatures = [FeatureAVX, @@ -905,38 +902,39 @@ def ProcessorFeatures { FeatureMOVBE, FeatureXSAVE, FeatureXSAVEOPT]; - list BtVer2SpecificFeatures = [FeatureFastLZCNT, - FeatureFastBEXTR, - FeatureFastHorizontalOps]; - list BtVer2InheritableFeatures = - !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures); + list BtVer2Tuning = [FeatureFastLZCNT, + 
FeatureFastBEXTR, + FeatureFastHorizontalOps, + FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks, + FeatureSlowSHLD]; list BtVer2Features = - !listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures); + !listconcat(BtVer1Features, BtVer2AdditionalFeatures); // Bulldozer - list BdVer1InheritableFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureXOP, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureFXSR, - FeatureNOPL, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureLWP, - FeatureSlowSHLD, - FeatureLAHFSAHF, - FeatureFast11ByteNOP, - FeatureFastScalarShiftMasks, - FeatureBranchFusion, - FeatureInsertVZEROUPPER]; - list BdVer1Features = BdVer1InheritableFeatures; + list BdVer1Features = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureXOP, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureFXSR, + FeatureNOPL, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureLWP, + FeatureLAHFSAHF]; + list BdVer1Tuning = [FeatureSlowSHLD, + FeatureFast11ByteNOP, + FeatureFastScalarShiftMasks, + FeatureBranchFusion, + FeatureInsertVZEROUPPER]; // PileDriver list BdVer2AdditionalFeatures = [FeatureF16C, @@ -944,16 +942,16 @@ def ProcessorFeatures { FeatureTBM, FeatureFMA, FeatureFastBEXTR]; - list BdVer2InheritableFeatures = - !listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures); - list BdVer2Features = BdVer2InheritableFeatures; + list BdVer2Tuning = BdVer1Tuning; + list BdVer2Features = + !listconcat(BdVer1Features, BdVer2AdditionalFeatures); // Steamroller list BdVer3AdditionalFeatures = [FeatureXSAVEOPT, FeatureFSGSBase]; - list BdVer3InheritableFeatures = - !listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures); - list BdVer3Features = BdVer3InheritableFeatures; + list BdVer3Tuning = BdVer2Tuning; + list BdVer3Features = + !listconcat(BdVer2Features, BdVer3AdditionalFeatures); // Excavator list BdVer4AdditionalFeatures = [FeatureAVX2, @@ -961,9 +959,9 @@ def ProcessorFeatures { FeatureMOVBE, FeatureRDRAND, FeatureMWAITX]; - list BdVer4InheritableFeatures = - !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures); - list BdVer4Features = BdVer4InheritableFeatures; + list BdVer4Tuning = BdVer3Tuning; + list BdVer4Features = + !listconcat(BdVer3Features, BdVer4AdditionalFeatures); // AMD Zen Processors common ISAs @@ -982,13 +980,8 @@ def ProcessorFeatures { FeatureFSGSBase, FeatureFXSR, FeatureNOPL, - FeatureFastLZCNT, FeatureLAHFSAHF, FeatureLZCNT, - FeatureFastBEXTR, - FeatureFast15ByteNOP, - FeatureBranchFusion, - FeatureFastScalarShiftMasks, FeatureMMX, FeatureMOVBE, FeatureMWAITX, @@ -999,16 +992,22 @@ def ProcessorFeatures { FeatureRDSEED, FeatureSHA, FeatureSSE4A, - FeatureSlowSHLD, - FeatureInsertVZEROUPPER, FeatureX87, FeatureXSAVE, FeatureXSAVEC, FeatureXSAVEOPT, FeatureXSAVES]; + list ZNTuning = [FeatureFastLZCNT, + FeatureFastBEXTR, + FeatureFast15ByteNOP, + FeatureBranchFusion, + FeatureFastScalarShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; list ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, FeatureWBNOINVD]; + list ZN2Tuning = ZNTuning; list ZN2Features = !listconcat(ZNFeatures, ZN2AdditionalFeatures); } @@ -1017,42 +1016,47 @@ def ProcessorFeatures { // X86 processors supported. 
//===----------------------------------------------------------------------===// -class Proc Features> - : ProcessorModel; +class Proc Features, + list TuneFeatures> + : ProcessorModel; + +class ProcModel Features, + list TuneFeatures> + : ProcessorModel; // NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled // if i386/i486 is specifically requested. // NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget // constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled. // It has no effect on code generation. -def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, Feature64Bit, - FeatureInsertVZEROUPPER]>; -def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16, - FeatureInsertVZEROUPPER]>; -def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16, - FeatureInsertVZEROUPPER]>; -def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; -def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, FeatureMMX, - FeatureInsertVZEROUPPER]>; - -def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV, FeatureInsertVZEROUPPER]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>; - -def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureCMOV, FeatureFXSR, - FeatureNOPL, FeatureInsertVZEROUPPER]>; +def : Proc<"generic", [FeatureX87, FeatureCMPXCHG8B, Feature64Bit], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"i386", [FeatureX87], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"i486", [FeatureX87], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, + FeatureNOPL], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, + FeatureFXSR, FeatureNOPL], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["pentium3", "pentium3m"] in { - def : Proc; + def : Proc; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. @@ -1065,45 +1069,45 @@ foreach P = ["pentium3", "pentium3m"] in { // measure to avoid performance surprises, in case clang's default cpu // changes slightly. 
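// [Editor's note] The Proc/ProcModel change just above is the heart of this
// X86.td refactor: each CPU now carries two lists, Features (ISA bits that
// affect which instructions are legal) and TuneFeatures (tuning knobs such
// as FeatureSlowUAMem16 or FeatureInsertVZEROUPPER that only affect how code
// is selected). The *Tuning lists threaded through ProcessorFeatures earlier
// in this diff follow the same split, which is what the "FIXME: ... if we
// supported mtune" below alludes to: with the lists separated, -mtune can
// eventually pick a tune set independently of -march.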
-def : ProcessorModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, - FeatureCMOV, FeatureInsertVZEROUPPER]>; +def : ProcModel<"pentium-m", GenericPostRAModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { -// def : ProcessorModel; +// def : ProcModel; +// [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // Since 'pentium4' is the default 32-bit CPU on Linux and Windows, // give it more modern tunings. // FIXME: This wouldn't be needed if we supported mtune. - def : ProcessorModel; + def : ProcModel; } // Intel Quark. -def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>; +def : Proc<"lakemont", [FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // Intel Core Duo. -def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV, FeatureInsertVZEROUPPER]>; +def : ProcModel<"yonah", SandyBridgeModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // NetBurst. -def : ProcessorModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV, FeatureInsertVZEROUPPER]>; -def : ProcessorModel<"nocona", GenericPostRAModel, [ +def : ProcModel<"prescott", GenericPostRAModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : ProcModel<"nocona", GenericPostRAModel, [ FeatureX87, - FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, @@ -1112,13 +1116,15 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, +], +[ + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; // Intel Core 2 Solo/Duo. -def : ProcessorModel<"core2", SandyBridgeModel, [ +def : ProcModel<"core2", SandyBridgeModel, [ FeatureX87, - FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, @@ -1127,13 +1133,15 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, - FeatureLAHFSAHF, + FeatureLAHFSAHF +], +[ FeatureMacroFusion, + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; -def : ProcessorModel<"penryn", SandyBridgeModel, [ +def : ProcModel<"penryn", SandyBridgeModel, [ FeatureX87, - FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, @@ -1142,140 +1150,165 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, - FeatureLAHFSAHF, + FeatureLAHFSAHF +], +[ FeatureMacroFusion, + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; // Atom CPUs. 
foreach P = ["bonnell", "atom"] in { - def : ProcessorModel; + def : ProcModel; } foreach P = ["silvermont", "slm"] in { - def : ProcessorModel; + def : ProcModel; } -def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>; -def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>; -def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>; +def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures, + ProcessorFeatures.GLMTuning>; +def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures, + ProcessorFeatures.GLPTuning>; +def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures, + ProcessorFeatures.TRMTuning>; // "Arrandale" along with corei3 and corei5 foreach P = ["nehalem", "corei7"] in { - def : ProcessorModel; + def : ProcModel; } // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : ProcessorModel<"westmere", SandyBridgeModel, - ProcessorFeatures.WSMFeatures>; +def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures, + ProcessorFeatures.WSMTuning>; foreach P = ["sandybridge", "corei7-avx"] in { - def : ProcessorModel; + def : ProcModel; } foreach P = ["ivybridge", "core-avx-i"] in { - def : ProcessorModel; + def : ProcModel; } foreach P = ["haswell", "core-avx2"] in { - def : ProcessorModel; + def : ProcModel; } -def : ProcessorModel<"broadwell", BroadwellModel, - ProcessorFeatures.BDWFeatures>; +def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures, + ProcessorFeatures.BDWTuning>; -def : ProcessorModel<"skylake", SkylakeClientModel, - ProcessorFeatures.SKLFeatures>; +def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures, + ProcessorFeatures.SKLTuning>; // FIXME: define KNL scheduler model -def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>; -def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>; +def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures, + ProcessorFeatures.KNLTuning>; +def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures, + ProcessorFeatures.KNLTuning>; foreach P = ["skylake-avx512", "skx"] in { - def : ProcessorModel; + def : ProcModel; } -def : ProcessorModel<"cascadelake", SkylakeServerModel, - ProcessorFeatures.CLXFeatures>; -def : ProcessorModel<"cooperlake", SkylakeServerModel, - ProcessorFeatures.CPXFeatures>; -def : ProcessorModel<"cannonlake", SkylakeServerModel, - ProcessorFeatures.CNLFeatures>; -def : ProcessorModel<"icelake-client", SkylakeServerModel, - ProcessorFeatures.ICLFeatures>; -def : ProcessorModel<"icelake-server", SkylakeServerModel, - ProcessorFeatures.ICXFeatures>; -def : ProcessorModel<"tigerlake", SkylakeServerModel, - ProcessorFeatures.TGLFeatures>; +def : ProcModel<"cascadelake", SkylakeServerModel, + ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>; +def : ProcModel<"cooperlake", SkylakeServerModel, + ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>; +def : ProcModel<"cannonlake", SkylakeServerModel, + ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>; +def : ProcModel<"icelake-client", SkylakeServerModel, + ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>; +def : ProcModel<"icelake-server", SkylakeServerModel, + ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>; +def : ProcModel<"tigerlake", SkylakeServerModel, + ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>; // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureInsertVZEROUPPER]>; -def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow, FeatureInsertVZEROUPPER]>; -def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow, FeatureInsertVZEROUPPER]>; +def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc; + def : Proc; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc; + def : Proc; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc; + def : Proc; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc; + def : Proc; } foreach P = ["amdfam10", "barcelona"] in { - def : Proc; + def : Proc; } // Bobcat -def : Proc<"btver1", ProcessorFeatures.BtVer1Features>; +def : Proc<"btver1", ProcessorFeatures.BtVer1Features, + ProcessorFeatures.BtVer1Tuning>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>; +def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features, + ProcessorFeatures.BtVer2Tuning>; // Bulldozer -def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>; +def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features, + ProcessorFeatures.BdVer1Tuning>; // Piledriver -def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>; +def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features, + ProcessorFeatures.BdVer2Tuning>; // Steamroller -def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>; +def : Proc<"bdver3", ProcessorFeatures.BdVer3Features, + ProcessorFeatures.BdVer3Tuning>; // Excavator -def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>; - -def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>; -def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>; - -def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNowA, FeatureInsertVZEROUPPER]>; - -def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureInsertVZEROUPPER]>; -def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, - FeatureInsertVZEROUPPER]>; -def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, - FeatureInsertVZEROUPPER]>; -def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX, FeatureSSE1, FeatureFXSR, - FeatureCMOV, FeatureInsertVZEROUPPER]>; +def : Proc<"bdver4", ProcessorFeatures.BdVer4Features, + ProcessorFeatures.BdVer4Tuning>; + +def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures, + ProcessorFeatures.ZNTuning>; +def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, + ProcessorFeatures.ZN2Tuning>; + +def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"winchip2", [FeatureX87, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"c3", [FeatureX87, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"c3-2", 
[FeatureX87, FeatureCMPXCHG8B, FeatureMMX, + FeatureSSE1, FeatureFXSR, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1287,7 +1320,7 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, // covers a huge swath of x86 processors. If there are specific scheduling // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. -def : ProcessorModel<"x86-64", SandyBridgeModel, [ +def : ProcModel<"x86-64", SandyBridgeModel, [ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, @@ -1296,6 +1329,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [ FeatureFXSR, FeatureNOPL, Feature64Bit, +], +[ FeatureSlow3OpsLEA, FeatureSlowDivide64, FeatureSlowIncDec, diff --git a/llvm/lib/Target/X86/X86CondBrFolding.cpp b/llvm/lib/Target/X86/X86CondBrFolding.cpp deleted file mode 100644 index 7ede94664bf62..0000000000000 --- a/llvm/lib/Target/X86/X86CondBrFolding.cpp +++ /dev/null @@ -1,579 +0,0 @@ -//===---- X86CondBrFolding.cpp - optimize conditional branches ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file defines a pass that optimizes condition branches on x86 by taking -// advantage of the three-way conditional code generated by compare -// instructions. -// Currently, it tries to hoisting EQ and NE conditional branch to a dominant -// conditional branch condition where the same EQ/NE conditional code is -// computed. An example: -// bb_0: -// cmp %0, 19 -// jg bb_1 -// jmp bb_2 -// bb_1: -// cmp %0, 40 -// jg bb_3 -// jmp bb_4 -// bb_4: -// cmp %0, 20 -// je bb_5 -// jmp bb_6 -// Here we could combine the two compares in bb_0 and bb_4 and have the -// following code: -// bb_0: -// cmp %0, 20 -// jg bb_1 -// jl bb_2 -// jmp bb_5 -// bb_1: -// cmp %0, 40 -// jg bb_3 -// jmp bb_6 -// For the case of %0 == 20 (bb_5), we eliminate two jumps, and the control -// height for bb_6 is also reduced. bb_4 is gone after the optimization. -// -// There are plenty of this code patterns, especially from the switch case -// lowing where we generate compare of "pivot-1" for the inner nodes in the -// binary search tree. 
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/BranchProbability.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-condbr-folding"
-
-STATISTIC(NumFixedCondBrs, "Number of x86 condbr folded");
-
-namespace {
-class X86CondBrFoldingPass : public MachineFunctionPass {
-public:
-  X86CondBrFoldingPass() : MachineFunctionPass(ID) {}
-  StringRef getPassName() const override { return "X86 CondBr Folding"; }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    MachineFunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<MachineBranchProbabilityInfo>();
-  }
-
-public:
-  static char ID;
-};
-} // namespace
-
-char X86CondBrFoldingPass::ID = 0;
-INITIALIZE_PASS(X86CondBrFoldingPass, "X86CondBrFolding", "X86CondBrFolding",
-                false, false)
-
-FunctionPass *llvm::createX86CondBrFolding() {
-  return new X86CondBrFoldingPass();
-}
-
-namespace {
-// A struct that stores the auxiliary information for each MBB.
-struct TargetMBBInfo {
-  MachineBasicBlock *TBB;
-  MachineBasicBlock *FBB;
-  MachineInstr *BrInstr;
-  MachineInstr *CmpInstr;
-  X86::CondCode BranchCode;
-  unsigned SrcReg;
-  int CmpValue;
-  bool Modified;
-  bool CmpBrOnly;
-};
-
-// A class that optimizes the conditional branch by hoisting and merging
-// CondCodes.
-class X86CondBrFolding {
-public:
-  X86CondBrFolding(const X86InstrInfo *TII,
-                   const MachineBranchProbabilityInfo *MBPI,
-                   MachineFunction &MF)
-      : TII(TII), MBPI(MBPI), MF(MF) {}
-  bool optimize();
-
-private:
-  const X86InstrInfo *TII;
-  const MachineBranchProbabilityInfo *MBPI;
-  MachineFunction &MF;
-  std::vector<std::unique_ptr<TargetMBBInfo>> MBBInfos;
-  SmallVector<MachineBasicBlock *, 2> RemoveList;
-
-  void optimizeCondBr(MachineBasicBlock &MBB,
-                      SmallVectorImpl<MachineBasicBlock *> &BranchPath);
-  void replaceBrDest(MachineBasicBlock *MBB, MachineBasicBlock *OrigDest,
-                     MachineBasicBlock *NewDest);
-  void fixupModifiedCond(MachineBasicBlock *MBB);
-  std::unique_ptr<TargetMBBInfo> analyzeMBB(MachineBasicBlock &MBB);
-  static bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
-                             int &CmpValue);
-  bool findPath(MachineBasicBlock *MBB,
-                SmallVectorImpl<MachineBasicBlock *> &BranchPath);
-  TargetMBBInfo *getMBBInfo(MachineBasicBlock *MBB) const {
-    return MBBInfos[MBB->getNumber()].get();
-  }
-};
-} // namespace
-
-// Find a valid path along which we can reuse the CondCode.
-// The resulting path (if true is returned) is stored in BranchPath.
-// Return value:
-//  false: no valid path is found.
-//  true: a valid path is found and the targetBB can be reached.
-bool X86CondBrFolding::findPath(
-    MachineBasicBlock *MBB, SmallVectorImpl<MachineBasicBlock *> &BranchPath) {
-  TargetMBBInfo *MBBInfo = getMBBInfo(MBB);
-  assert(MBBInfo && "Expecting a candidate MBB");
-  int CmpValue = MBBInfo->CmpValue;
-
-  MachineBasicBlock *PredMBB = *MBB->pred_begin();
-  MachineBasicBlock *SaveMBB = MBB;
-  while (PredMBB) {
-    TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB);
-    if (!PredMBBInfo || PredMBBInfo->SrcReg != MBBInfo->SrcReg)
-      return false;
-
-    assert(SaveMBB == PredMBBInfo->TBB || SaveMBB == PredMBBInfo->FBB);
-    bool IsFalseBranch = (SaveMBB == PredMBBInfo->FBB);
-
-    X86::CondCode CC = PredMBBInfo->BranchCode;
-    assert(CC == X86::COND_L || CC == X86::COND_G || CC == X86::COND_E);
-    int PredCmpValue = PredMBBInfo->CmpValue;
-    bool ValueCmpTrue = ((CmpValue < PredCmpValue && CC == X86::COND_L) ||
-                         (CmpValue > PredCmpValue && CC == X86::COND_G) ||
-                         (CmpValue == PredCmpValue && CC == X86::COND_E));
-    // Check if both the result of value compare and the branch target match.
-    if (!(ValueCmpTrue ^ IsFalseBranch)) {
-      LLVM_DEBUG(dbgs() << "Dead BB detected!\n");
-      return false;
-    }
-
-    BranchPath.push_back(PredMBB);
-    // These are the conditions on which we could combine the compares.
-    if ((CmpValue == PredCmpValue) ||
-        (CmpValue == PredCmpValue - 1 && CC == X86::COND_L) ||
-        (CmpValue == PredCmpValue + 1 && CC == X86::COND_G))
-      return true;
-
-    // If PredMBB has more than one pred, or is not a pure cmp-and-br block,
-    // we bail out.
-    if (PredMBB->pred_size() != 1 || !PredMBBInfo->CmpBrOnly)
-      return false;
-
-    SaveMBB = PredMBB;
-    PredMBB = *PredMBB->pred_begin();
-  }
-  return false;
-}
-
-// Fix up any PHI node in the successor of MBB.
-static void fixPHIsInSucc(MachineBasicBlock *MBB, MachineBasicBlock *OldMBB,
-                          MachineBasicBlock *NewMBB) {
-  if (NewMBB == OldMBB)
-    return;
-  for (auto MI = MBB->instr_begin(), ME = MBB->instr_end();
-       MI != ME && MI->isPHI(); ++MI)
-    for (unsigned i = 2, e = MI->getNumOperands() + 1; i != e; i += 2) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (MO.getMBB() == OldMBB)
-        MO.setMBB(NewMBB);
-    }
-}
-
-// Utility function to set branch probability for edge MBB->SuccMBB.
-static inline bool setBranchProb(MachineBasicBlock *MBB,
-                                 MachineBasicBlock *SuccMBB,
-                                 BranchProbability Prob) {
-  auto MBBI = std::find(MBB->succ_begin(), MBB->succ_end(), SuccMBB);
-  if (MBBI == MBB->succ_end())
-    return false;
-  MBB->setSuccProbability(MBBI, Prob);
-  return true;
-}
-
-// Utility function to find the unconditional br instruction in MBB.
-static inline MachineBasicBlock::iterator
-findUncondBrI(MachineBasicBlock *MBB) {
-  return std::find_if(MBB->begin(), MBB->end(), [](MachineInstr &MI) -> bool {
-    return MI.getOpcode() == X86::JMP_1;
-  });
-}
-
-// Replace MBB's original successor, OrigDest, with NewDest.
-// Also update the MBBInfo for MBB.
-void X86CondBrFolding::replaceBrDest(MachineBasicBlock *MBB,
-                                     MachineBasicBlock *OrigDest,
-                                     MachineBasicBlock *NewDest) {
-  TargetMBBInfo *MBBInfo = getMBBInfo(MBB);
-  MachineInstr *BrMI;
-  if (MBBInfo->TBB == OrigDest) {
-    BrMI = MBBInfo->BrInstr;
-    MachineInstrBuilder MIB =
-        BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), TII->get(X86::JCC_1))
-            .addMBB(NewDest).addImm(MBBInfo->BranchCode);
-    MBBInfo->TBB = NewDest;
-    MBBInfo->BrInstr = MIB.getInstr();
-  } else { // Should be the unconditional jump stmt.
- MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB); - BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1)) - .addMBB(NewDest); - MBBInfo->FBB = NewDest; - BrMI = &*UncondBrI; - } - fixPHIsInSucc(NewDest, OrigDest, MBB); - BrMI->eraseFromParent(); - MBB->addSuccessor(NewDest); - setBranchProb(MBB, NewDest, MBPI->getEdgeProbability(MBB, OrigDest)); - MBB->removeSuccessor(OrigDest); -} - -// Change the CondCode and BrInstr according to MBBInfo. -void X86CondBrFolding::fixupModifiedCond(MachineBasicBlock *MBB) { - TargetMBBInfo *MBBInfo = getMBBInfo(MBB); - if (!MBBInfo->Modified) - return; - - MachineInstr *BrMI = MBBInfo->BrInstr; - X86::CondCode CC = MBBInfo->BranchCode; - MachineInstrBuilder MIB = BuildMI(*MBB, BrMI, MBB->findDebugLoc(BrMI), - TII->get(X86::JCC_1)) - .addMBB(MBBInfo->TBB).addImm(CC); - BrMI->eraseFromParent(); - MBBInfo->BrInstr = MIB.getInstr(); - - MachineBasicBlock::iterator UncondBrI = findUncondBrI(MBB); - BuildMI(*MBB, UncondBrI, MBB->findDebugLoc(UncondBrI), TII->get(X86::JMP_1)) - .addMBB(MBBInfo->FBB); - MBB->erase(UncondBrI); - MBBInfo->Modified = false; -} - -// -// Apply the transformation: -// RootMBB -1-> ... PredMBB -3-> MBB -5-> TargetMBB -// \-2-> \-4-> \-6-> FalseMBB -// ==> -// RootMBB -1-> ... PredMBB -7-> FalseMBB -// TargetMBB <-8-/ \-2-> \-4-> -// -// Note that PredMBB and RootMBB could be the same. -// And in the case of dead TargetMBB, we will not have TargetMBB and edge 8. -// -// There are some special handling where the RootMBB is COND_E in which case -// we directly short-cycle the brinstr. -// -void X86CondBrFolding::optimizeCondBr( - MachineBasicBlock &MBB, SmallVectorImpl &BranchPath) { - - X86::CondCode CC; - TargetMBBInfo *MBBInfo = getMBBInfo(&MBB); - assert(MBBInfo && "Expecting a candidate MBB"); - MachineBasicBlock *TargetMBB = MBBInfo->TBB; - BranchProbability TargetProb = MBPI->getEdgeProbability(&MBB, MBBInfo->TBB); - - // Forward the jump from MBB's predecessor to MBB's false target. - MachineBasicBlock *PredMBB = BranchPath.front(); - TargetMBBInfo *PredMBBInfo = getMBBInfo(PredMBB); - assert(PredMBBInfo && "Expecting a candidate MBB"); - if (PredMBBInfo->Modified) - fixupModifiedCond(PredMBB); - CC = PredMBBInfo->BranchCode; - // Don't do this if depth of BranchPath is 1 and PredMBB is of COND_E. - // We will short-cycle directly for this case. - if (!(CC == X86::COND_E && BranchPath.size() == 1)) - replaceBrDest(PredMBB, &MBB, MBBInfo->FBB); - - MachineBasicBlock *RootMBB = BranchPath.back(); - TargetMBBInfo *RootMBBInfo = getMBBInfo(RootMBB); - assert(RootMBBInfo && "Expecting a candidate MBB"); - if (RootMBBInfo->Modified) - fixupModifiedCond(RootMBB); - CC = RootMBBInfo->BranchCode; - - if (CC != X86::COND_E) { - MachineBasicBlock::iterator UncondBrI = findUncondBrI(RootMBB); - // RootMBB: Cond jump to the original not-taken MBB. 
-    X86::CondCode NewCC;
-    switch (CC) {
-    case X86::COND_L:
-      NewCC = X86::COND_G;
-      break;
-    case X86::COND_G:
-      NewCC = X86::COND_L;
-      break;
-    default:
-      llvm_unreachable("unexpected conditional code.");
-    }
-    BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
-            TII->get(X86::JCC_1))
-        .addMBB(RootMBBInfo->FBB).addImm(NewCC);
-
-    // RootMBB: Jump to TargetMBB
-    BuildMI(*RootMBB, UncondBrI, RootMBB->findDebugLoc(UncondBrI),
-            TII->get(X86::JMP_1))
-        .addMBB(TargetMBB);
-    RootMBB->addSuccessor(TargetMBB);
-    fixPHIsInSucc(TargetMBB, &MBB, RootMBB);
-    RootMBB->erase(UncondBrI);
-  } else {
-    replaceBrDest(RootMBB, RootMBBInfo->TBB, TargetMBB);
-  }
-
-  // Fix RootMBB's CmpValue to MBB's CmpValue to TargetMBB. Don't set Imm
-  // directly. Move MBB's stmt to here as the opcode might be different.
-  if (RootMBBInfo->CmpValue != MBBInfo->CmpValue) {
-    MachineInstr *NewCmp = MBBInfo->CmpInstr;
-    NewCmp->removeFromParent();
-    RootMBB->insert(RootMBBInfo->CmpInstr, NewCmp);
-    RootMBBInfo->CmpInstr->eraseFromParent();
-  }
-
-  // Fix branch probabilities.
-  auto fixBranchProb = [&](MachineBasicBlock *NextMBB) {
-    BranchProbability Prob;
-    for (auto &I : BranchPath) {
-      MachineBasicBlock *ThisMBB = I;
-      if (!ThisMBB->hasSuccessorProbabilities() ||
-          !ThisMBB->isSuccessor(NextMBB))
-        break;
-      Prob = MBPI->getEdgeProbability(ThisMBB, NextMBB);
-      if (Prob.isUnknown())
-        break;
-      TargetProb = Prob * TargetProb;
-      Prob = Prob - TargetProb;
-      setBranchProb(ThisMBB, NextMBB, Prob);
-      if (ThisMBB == RootMBB) {
-        setBranchProb(ThisMBB, TargetMBB, TargetProb);
-      }
-      ThisMBB->normalizeSuccProbs();
-      if (ThisMBB == RootMBB)
-        break;
-      NextMBB = ThisMBB;
-    }
-    return true;
-  };
-  if (CC != X86::COND_E && !TargetProb.isUnknown())
-    fixBranchProb(MBBInfo->FBB);
-
-  if (CC != X86::COND_E)
-    RemoveList.push_back(&MBB);
-
-  // Invalidate MBBInfo just in case.
-  MBBInfos[MBB.getNumber()] = nullptr;
-  MBBInfos[RootMBB->getNumber()] = nullptr;
-
-  LLVM_DEBUG(dbgs() << "After optimization:\nRootMBB is: " << *RootMBB << "\n");
-  if (BranchPath.size() > 1)
-    LLVM_DEBUG(dbgs() << "PredMBB is: " << *(BranchPath[0]) << "\n");
-}
-
-// Driver function for optimization: find the valid candidate and apply
-// the transformation.
-bool X86CondBrFolding::optimize() {
-  bool Changed = false;
-  LLVM_DEBUG(dbgs() << "***** X86CondBr Folding on Function: " << MF.getName()
-                    << " *****\n");
-  // Setup data structures.
-  MBBInfos.resize(MF.getNumBlockIDs());
-  for (auto &MBB : MF)
-    MBBInfos[MBB.getNumber()] = analyzeMBB(MBB);
-
-  for (auto &MBB : MF) {
-    TargetMBBInfo *MBBInfo = getMBBInfo(&MBB);
-    if (!MBBInfo || !MBBInfo->CmpBrOnly)
-      continue;
-    if (MBB.pred_size() != 1)
-      continue;
-    LLVM_DEBUG(dbgs() << "Work on MBB."
<< MBB.getNumber()
-                      << " CmpValue: " << MBBInfo->CmpValue << "\n");
-    SmallVector<MachineBasicBlock *, 4> BranchPath;
-    if (!findPath(&MBB, BranchPath))
-      continue;
-
-#ifndef NDEBUG
-    LLVM_DEBUG(dbgs() << "Found one path (len=" << BranchPath.size() << "):\n");
-    int Index = 1;
-    LLVM_DEBUG(dbgs() << "Target MBB is: " << MBB << "\n");
-    for (auto I = BranchPath.rbegin(); I != BranchPath.rend(); ++I, ++Index) {
-      MachineBasicBlock *PMBB = *I;
-      TargetMBBInfo *PMBBInfo = getMBBInfo(PMBB);
-      LLVM_DEBUG(dbgs() << "Path MBB (" << Index << " of " << BranchPath.size()
-                        << ") is " << *PMBB);
-      LLVM_DEBUG(dbgs() << "CC=" << PMBBInfo->BranchCode
-                        << "  Val=" << PMBBInfo->CmpValue
-                        << "  CmpBrOnly=" << PMBBInfo->CmpBrOnly << "\n\n");
-    }
-#endif
-    optimizeCondBr(MBB, BranchPath);
-    Changed = true;
-  }
-  NumFixedCondBrs += RemoveList.size();
-  for (auto MBBI : RemoveList) {
-    while (!MBBI->succ_empty())
-      MBBI->removeSuccessor(MBBI->succ_end() - 1);
-
-    MBBI->eraseFromParent();
-  }
-
-  return Changed;
-}
-
-// Analyze instructions that generate CondCode and extract information.
-bool X86CondBrFolding::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
-                                      int &CmpValue) {
-  unsigned SrcRegIndex = 0;
-  unsigned ValueIndex = 0;
-  switch (MI.getOpcode()) {
-  // TODO: handle test instructions.
-  default:
-    return false;
-  case X86::CMP64ri32:
-  case X86::CMP64ri8:
-  case X86::CMP32ri:
-  case X86::CMP32ri8:
-  case X86::CMP16ri:
-  case X86::CMP16ri8:
-  case X86::CMP8ri:
-    SrcRegIndex = 0;
-    ValueIndex = 1;
-    break;
-  case X86::SUB64ri32:
-  case X86::SUB64ri8:
-  case X86::SUB32ri:
-  case X86::SUB32ri8:
-  case X86::SUB16ri:
-  case X86::SUB16ri8:
-  case X86::SUB8ri:
-    SrcRegIndex = 1;
-    ValueIndex = 2;
-    break;
-  }
-  SrcReg = MI.getOperand(SrcRegIndex).getReg();
-  if (!MI.getOperand(ValueIndex).isImm())
-    return false;
-  CmpValue = MI.getOperand(ValueIndex).getImm();
-  return true;
-}
-
-// Analyze a candidate MBB and extract all the information needed.
-// The valid candidate will have two successors.
-// It also should have a sequence of
-//    Branch_instr,
-//    CondBr,
-//    UnCondBr.
-// Return TargetMBBInfo if MBB is a valid candidate and nullptr otherwise.
-std::unique_ptr<TargetMBBInfo>
-X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
-  MachineBasicBlock *TBB;
-  MachineBasicBlock *FBB;
-  MachineInstr *BrInstr;
-  MachineInstr *CmpInstr;
-  X86::CondCode CC;
-  unsigned SrcReg;
-  int CmpValue;
-  bool Modified;
-  bool CmpBrOnly;
-
-  if (MBB.succ_size() != 2)
-    return nullptr;
-
-  CmpBrOnly = true;
-  FBB = TBB = nullptr;
-  CmpInstr = nullptr;
-  MachineBasicBlock::iterator I = MBB.end();
-  while (I != MBB.begin()) {
-    --I;
-    if (I->isDebugValue())
-      continue;
-    if (I->getOpcode() == X86::JMP_1) {
-      if (FBB)
-        return nullptr;
-      FBB = I->getOperand(0).getMBB();
-      continue;
-    }
-    if (I->isBranch()) {
-      if (TBB)
-        return nullptr;
-      CC = X86::getCondFromBranch(*I);
-      switch (CC) {
-      default:
-        return nullptr;
-      case X86::COND_E:
-      case X86::COND_L:
-      case X86::COND_G:
-      case X86::COND_NE:
-      case X86::COND_LE:
-      case X86::COND_GE:
-        break;
-      }
-      TBB = I->getOperand(0).getMBB();
-      BrInstr = &*I;
-      continue;
-    }
-    if (analyzeCompare(*I, SrcReg, CmpValue)) {
-      if (CmpInstr)
-        return nullptr;
-      CmpInstr = &*I;
-      continue;
-    }
-    CmpBrOnly = false;
-    break;
-  }
-
-  if (!TBB || !FBB || !CmpInstr)
-    return nullptr;
-
-  // Simplify CondCode. Note this is only to simplify the findPath logic
-  // and will not change the instruction here.
- switch (CC) { - case X86::COND_NE: - CC = X86::COND_E; - std::swap(TBB, FBB); - Modified = true; - break; - case X86::COND_LE: - if (CmpValue == INT_MAX) - return nullptr; - CC = X86::COND_L; - CmpValue += 1; - Modified = true; - break; - case X86::COND_GE: - if (CmpValue == INT_MIN) - return nullptr; - CC = X86::COND_G; - CmpValue -= 1; - Modified = true; - break; - default: - Modified = false; - break; - } - return std::make_unique(TargetMBBInfo{ - TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly}); -} - -bool X86CondBrFoldingPass::runOnMachineFunction(MachineFunction &MF) { - const X86Subtarget &ST = MF.getSubtarget(); - if (!ST.threewayBranchProfitable()) - return false; - const X86InstrInfo *TII = ST.getInstrInfo(); - const MachineBranchProbabilityInfo *MBPI = - &getAnalysis(); - - X86CondBrFolding CondBr(TII, MBPI, MF); - return CondBr.optimize(); -} diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 3cd80cb04ab84..58424892535a7 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1005,6 +1005,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::STRICT_FFLOOR: case ISD::FTRUNC: case ISD::STRICT_FTRUNC: + case ISD::FROUNDEVEN: + case ISD::STRICT_FROUNDEVEN: case ISD::FNEARBYINT: case ISD::STRICT_FNEARBYINT: case ISD::FRINT: @@ -1020,6 +1022,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::FFLOOR: Imm = 0x9; break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: Imm = 0xB; break; + case ISD::STRICT_FROUNDEVEN: + case ISD::FROUNDEVEN: Imm = 0x8; break; case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: Imm = 0xC; break; case ISD::STRICT_FRINT: @@ -3940,67 +3944,169 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; - unsigned Opc1 = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - auto isLogicOp = [](unsigned Opc) { - return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || - Opc == X86ISD::ANDNP; + auto getFoldableLogicOp = [](SDValue Op) { + // Peek through single use bitcast. + if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) + Op = Op.getOperand(0); + + if (!Op.hasOneUse()) + return SDValue(); + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || + Opc == X86ISD::ANDNP) + return Op; + + return SDValue(); }; - SDValue A, B, C; - unsigned Opc2; - if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { - Opc2 = N1.getOpcode(); + SDValue A, FoldableOp; + if ((FoldableOp = getFoldableLogicOp(N1))) { A = N0; - B = N1.getOperand(0); - C = N1.getOperand(1); - } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { - Opc2 = N0.getOpcode(); + } else if ((FoldableOp = getFoldableLogicOp(N0))) { A = N1; - B = N0.getOperand(0); - C = N0.getOperand(1); } else return false; - uint64_t Imm; - switch (Opc1) { + SDValue B = FoldableOp.getOperand(0); + SDValue C = FoldableOp.getOperand(1); + + // We can build the appropriate control immediate by performing the logic + // operation we're matching using these constants for A, B, and C. 
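(Editor's note: the constants introduced just below are the eight-row truth tables of the three VPTERNLOG inputs; bit 4a+2b+c of the immediate is the result for inputs (a, b, c), so evaluating the matched expression on these bytes reproduces the old hand-written table. A standalone check of two removed entries:)

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t A = 0xf0, B = 0xcc, C = 0xaa; // the TernlogMagic* values
  // or(A, and(B, C)) must equal the removed table's OR/AND entry, 0xf8:
  assert(((B & C) | A) == 0xf8);
  // xor(A, andnp(B, C)), i.e. (~B & C) ^ A, must equal its XOR/ANDNP
  // entry, 0xd2:
  assert((uint8_t)((~B & C) ^ A) == 0xd2);
}
```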
+ const uint8_t TernlogMagicA = 0xf0; + const uint8_t TernlogMagicB = 0xcc; + const uint8_t TernlogMagicC = 0xaa; + + uint8_t Imm; + switch (FoldableOp.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0x80; break; - case ISD::OR: Imm = 0xe0; break; - case ISD::XOR: Imm = 0x60; break; - case X86ISD::ANDNP: Imm = 0x20; break; - } - break; - case ISD::OR: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0xf8; break; - case ISD::OR: Imm = 0xfe; break; - case ISD::XOR: Imm = 0xf6; break; - case X86ISD::ANDNP: Imm = 0xf2; break; - } - break; - case ISD::XOR: - switch (Opc2) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = 0x78; break; - case ISD::OR: Imm = 0x1e; break; - case ISD::XOR: Imm = 0x96; break; - case X86ISD::ANDNP: Imm = 0xd2; break; - } - break; + case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; + case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; + case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; + case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; + } + + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm &= TernlogMagicA; break; + case ISD::OR: Imm |= TernlogMagicA; break; + case ISD::XOR: Imm ^= TernlogMagicA; break; + } + + auto tryFoldLoadOrBCast = + [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { + if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) + return true; + + // Not a load, check for broadcast which may be behind a bitcast. + if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { + P = L.getNode(); + L = L.getOperand(0); + } + + if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; + + // Only 32 and 64 bit broadcasts are supported. + auto *MemIntr = cast(L); + unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); + if (Size != 32 && Size != 64) + return false; + + return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); + }; + + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoadOrBCast(N, FoldableOp.getNode(), C, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4)) { + FoldedLoad = true; + } else if (tryFoldLoadOrBCast(N, N, A, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + FoldedLoad = true; + std::swap(A, C); + // Swap bits 1/4 and 3/6. + uint8_t OldImm = Imm; + Imm = OldImm & 0xa5; + if (OldImm & 0x02) Imm |= 0x10; + if (OldImm & 0x10) Imm |= 0x02; + if (OldImm & 0x08) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x08; + } else if (tryFoldLoadOrBCast(N, FoldableOp.getNode(), B, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + FoldedLoad = true; + std::swap(B, C); + // Swap bits 1/2 and 5/6. 
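(Editor's note: both bit-swap tables follow from re-indexing the truth table when two inputs commute. For a B/C swap, row (a, b, c) moves to (a, c, b), so rows with b == c -- immediate bits 0, 3, 4, 7, i.e. mask 0x99 -- are fixed points, and bits 1/2 and 5/6 trade places. A self-contained model of the B/C case implemented just below:)

```cpp
#include <cassert>
#include <cstdint>

static uint8_t commuteTernlogBC(uint8_t Imm) {
  uint8_t New = Imm & 0x99; // rows where b == c stay put
  if (Imm & 0x02) New |= 0x04;
  if (Imm & 0x04) New |= 0x02;
  if (Imm & 0x20) New |= 0x40;
  if (Imm & 0x40) New |= 0x20;
  return New;
}

int main() {
  // and/or are symmetric in B and C, so their immediates must be unchanged.
  assert(commuteTernlogBC(0xcc & 0xaa) == (0xcc & 0xaa));
  assert(commuteTernlogBC(0xcc | 0xaa) == (0xcc | 0xaa));
  // andnp(B, C) = ~B & C (0x22) must become ~C & B (0x44) after the swap.
  assert(commuteTernlogBC(0x22) == 0x44);
}
```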
+ uint8_t OldImm = Imm; + Imm = OldImm & 0x99; + if (OldImm & 0x02) Imm |= 0x04; + if (OldImm & 0x04) Imm |= 0x02; + if (OldImm & 0x20) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x20; } SDLoc DL(N); - SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, - CurDAG->getTargetConstant(Imm, DL, MVT::i8)); - ReplaceNode(N, New.getNode()); - SelectCode(New.getNode()); + + SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); + + MachineSDNode *MNode; + if (FoldedLoad) { + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + + unsigned Opc; + if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast(C); + unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); + assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); + + bool UseD = EltSize == 32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; + else + llvm_unreachable("Unexpected vector size!"); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; + else + llvm_unreachable("Unexpected vector size!"); + } + + SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; + MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); + + // Update the chain. + ReplaceUses(C.getValue(1), SDValue(MNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(MNode, {cast(C)->getMemOperand()}); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + unsigned Opc; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; + else + llvm_unreachable("Unexpected vector size!"); + + MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); + } + + ReplaceUses(SDValue(N, 0), SDValue(MNode, 0)); + CurDAG->RemoveDeadNode(N); return true; } @@ -4101,15 +4207,15 @@ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ VPTESTM_CASE(v64i8, BZ##SUFFIX) \ VPTESTM_CASE(v32i16, WZ##SUFFIX) - if (FoldedLoad) { + if (FoldedBCast) { switch (TestVT.SimpleTy) { - VPTESTM_FULL_CASES(rm) + VPTESTM_BROADCAST_CASES(rmb) } } - if (FoldedBCast) { + if (FoldedLoad) { switch (TestVT.SimpleTy) { - VPTESTM_BROADCAST_CASES(rmb) + VPTESTM_FULL_CASES(rm) } } @@ -4168,79 +4274,56 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, } } - // Without VLX we need to widen the load. + // Without VLX we need to widen the operation. bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); - // We can only fold loads if the sources are unique. - bool CanFoldLoads = Src0 != Src1; + auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, + SDValue &Base, SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + // If we need to widen, we can't fold the load. + if (!Widen) + if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) + return true; - // Try to fold loads unless we need to widen. 
- bool FoldedLoad = false; - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; - if (!Widen && CanFoldLoads) { - Load = Src1; - FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, - Tmp4); - if (!FoldedLoad) { - // And is computative. - Load = Src0; - FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, - Tmp3, Tmp4); - if (FoldedLoad) - std::swap(Src0, Src1); - } - } + // If we didn't fold a load, try to match broadcast. No widening limitation + // for this. But only 32 and 64 bit types are supported. + if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) + return false; - auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { // Look through single use bitcasts. - if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { - Parent = Src.getNode(); - Src = Src.getOperand(0); + if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { + P = L.getNode(); + L = L.getOperand(0); } - if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { - auto *MemIntr = cast(Src); - if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) - return Src; - } + if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; - return SDValue(); + auto *MemIntr = cast(L); + if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) + return false; + + return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; - // If we didn't fold a load, try to match broadcast. No widening limitation - // for this. But only 32 and 64 bit types are supported. - bool FoldedBCast = false; - if (!FoldedLoad && CanFoldLoads && - (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { - SDNode *ParentNode = N0.getNode(); - if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); - } + // We can only fold loads if the sources are unique. + bool CanFoldLoads = Src0 != Src1; - // Try the other operand. - if (!FoldedBCast) { - SDNode *ParentNode = N0.getNode(); - if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); - if (FoldedBCast) - std::swap(Src0, Src1); - } + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (CanFoldLoads) { + FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4); + if (!FoldedLoad) { + // And is commutative. + FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, + Tmp2, Tmp3, Tmp4); + if (FoldedLoad) + std::swap(Src0, Src1); } } - auto getMaskRC = [](MVT MaskVT) { - switch (MaskVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v2i1: return X86::VK2RegClassID; - case MVT::v4i1: return X86::VK4RegClassID; - case MVT::v8i1: return X86::VK8RegClassID; - case MVT::v16i1: return X86::VK16RegClassID; - case MVT::v32i1: return X86::VK32RegClassID; - case MVT::v64i1: return X86::VK64RegClassID; - } - }; + bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; bool IsMasked = InMask.getNode() != nullptr; @@ -4259,13 +4342,12 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, CmpVT), 0); Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); - assert(!FoldedLoad && "Shouldn't have folded the load"); if (!FoldedBCast) Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); if (IsMasked) { // Widen the mask. 
- unsigned RegClass = getMaskRC(MaskVT); + unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MaskVT, InMask, RC), 0); @@ -4277,23 +4359,23 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, IsMasked); MachineSDNode *CNode; - if (FoldedLoad || FoldedBCast) { + if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); if (IsMasked) { SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, - Load.getOperand(0) }; + Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } else { SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, - Load.getOperand(0) }; + Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } // Update the chain. - ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); + ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast(Src1)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); @@ -4303,7 +4385,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // If we widened, we need to shrink the mask VT. if (Widen) { - unsigned RegClass = getMaskRC(ResVT); + unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, ResVT, SDValue(CNode, 0), RC); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2e97bef6c106a..79047c90ff999 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1081,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::FROUND, RoundedTy, Custom); } @@ -1175,6 +1177,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); @@ -1560,6 +1564,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); } @@ -2531,17 +2537,6 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { return TargetLowering::getSafeStackPointerLocation(IRB); } -bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, - unsigned DestAS) const { - assert(SrcAS != DestAS && "Expected different address spaces!"); - - const TargetMachine &TM = getTargetMachine(); - if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) - return false; - - return SrcAS < 256 && DestAS < 256; -} - 
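(Editor's note, for context on the override removed above -- a sketch of the rule its body encoded, assuming the behavior simply moves elsewhere rather than changes: x86 address spaces 256 and 257 select the gs/fs segment bases, so only casts between flat sub-256 spaces of equal pointer width are representation-preserving.)

```cpp
// Hypothetical standalone restatement of the removed predicate; the names
// and the free-function form are mine, not LLVM's API.
static bool isNoopX86AddrSpaceCast(unsigned SrcAS, unsigned DestAS,
                                   unsigned SrcPtrBits, unsigned DestPtrBits) {
  if (SrcPtrBits != DestPtrBits)
    return false;                     // e.g. ptr32 <-> flat 64-bit is a real cast
  return SrcAS < 256 && DestAS < 256; // segment-based spaces are never no-ops
}
```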
//===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -5619,6 +5614,11 @@ static bool scaleShuffleElements(ArrayRef Mask, unsigned NumDstElts, return false; } +static bool canScaleShuffleElements(ArrayRef Mask, unsigned NumDstElts) { + SmallVector WidenedMask; + return scaleShuffleElements(Mask, NumDstElts, WidenedMask); +} + /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); @@ -6686,7 +6686,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, return false; // Insert the extracted elements into the mask. - for (APInt Elt : EltBits) + for (const APInt &Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; @@ -7401,8 +7401,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. - SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); - SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); + SDValue N0 = peekThroughBitcasts(N.getOperand(0)); + SDValue N1 = peekThroughBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; @@ -7413,34 +7413,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, true)) return false; - // Shuffle inputs must be the same size as the result. - if (llvm::any_of(SrcInputs0, [VT](SDValue Op) { - return VT.getSizeInBits() != Op.getValueSizeInBits(); - })) - return false; - if (llvm::any_of(SrcInputs1, [VT](SDValue Op) { - return VT.getSizeInBits() != Op.getValueSizeInBits(); - })) - return false; - size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (size_t i = 0; i != MaskSize; ++i) { + for (int i = 0; i != (int)MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) - Mask.push_back(Mask0[i]); + Mask.push_back(i); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); + Mask.push_back(i + MaskSize); else return false; } - Ops.append(SrcInputs0.begin(), SrcInputs0.end()); - Ops.append(SrcInputs1.begin(), SrcInputs1.end()); + Ops.push_back(N0); + Ops.push_back(N1); return true; } case ISD::INSERT_SUBVECTOR: { @@ -7595,19 +7585,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); - // If we know input saturation won't happen we can treat this - // as a truncation shuffle. + // If we know input saturation won't happen (or we don't care for particular + // lanes), we can treat this as a truncation shuffle. 
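(Editor's note: a scalar picture of why the relaxed per-lane check above is sound -- my illustration, not patch content. A PACKSS lane only differs from plain truncation when the saturating clamps fire, which the sign-bit query rules out for the lanes that are actually demanded:)

```cpp
#include <cassert>
#include <cstdint>

// One PACKSSWB lane: with the input known to be in [-128, 127] (at least
// nine sign bits as an i16), both clamps are dead and this is a truncation.
static int8_t packssLane(int16_t V) {
  if (V > INT8_MAX) return INT8_MAX;
  if (V < INT8_MIN) return INT8_MIN;
  return (int8_t)V;
}

int main() {
  for (int V = INT8_MIN; V <= INT8_MAX; ++V)
    assert(packssLane((int16_t)V) == (int8_t)V); // pure truncation in range
}
```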
if (Opcode == X86ISD::PACKSS) { - if ((!N0.isUndef() && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || - (!N1.isUndef() && + (!(N1.isUndef() || EltsRHS.isNullValue()) && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || - (!N1.isUndef() && + (!(N1.isUndef() || EltsRHS.isNullValue()) && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } @@ -10938,6 +10928,15 @@ static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); + // If the mask only uses one non-undef element, then fully 'splat' it to + // improve later broadcast matching. + int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); + assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"); + + int FirstElt = Mask[FirstIndex]; + if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; })) + return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt; + unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; @@ -13723,9 +13722,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); - int WidenedMask[4] = { - std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, - std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), + Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), + Mask[1] < 0 ? -1 : (Mask[1] * 2), + Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, @@ -15056,6 +15056,12 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + // If we have XOP we can use one VPPERM instead of multiple PSHUFBs. + if (Subtarget.hasXOP()) { + SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true); + return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode); + } + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerShuffleAsByteRotateAndPermute( @@ -28103,8 +28109,8 @@ bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { /// a) very likely accessed only by a single thread to minimize cache traffic, /// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue Chain, SDLoc DL) { + const X86Subtarget &Subtarget, SDValue Chain, + const SDLoc &DL) { // Implementation notes: // 1) LOCK prefix creates a full read/write reordering barrier for memory // operations issued by the current processor. 
As such, the location
@@ -30953,6 +30959,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
 // X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
+// Returns true if EFLAGS is consumed after this iterator in the rest of the
+// basic block or any successors of the basic block.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+                              MachineBasicBlock *BB) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+       miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return true;
+    // If we found a def, we can stop searching.
+    if (mi.definesRegister(X86::EFLAGS))
+      return false;
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                        sEnd = BB->succ_end();
+       sItr != sEnd; ++sItr) {
+    MachineBasicBlock* succ = *sItr;
+    if (succ->isLiveIn(X86::EFLAGS))
+      return true;
+  }
+
+  return false;
+}
+
 /// Utility function to emit xbegin specifying the start of an RTM region.
 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                      const TargetInstrInfo *TII) {
@@ -30985,6 +31019,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   MF->insert(I, fallMBB);
   MF->insert(I, sinkMBB);
 
+  if (isEFLAGSLiveAfter(MI, MBB)) {
+    mainMBB->addLiveIn(X86::EFLAGS);
+    fallMBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
   // Transfer the remainder of BB and its successor edges to sinkMBB.
   sinkMBB->splice(sinkMBB->begin(), MBB,
                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
@@ -31373,27 +31413,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                      MachineBasicBlock* BB,
                                      const TargetRegisterInfo* TRI) {
-  // Scan forward through BB for a use/def of EFLAGS.
-  MachineBasicBlock::iterator miI(std::next(SelectItr));
-  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
-    const MachineInstr& mi = *miI;
-    if (mi.readsRegister(X86::EFLAGS))
-      return false;
-    if (mi.definesRegister(X86::EFLAGS))
-      break; // Should have kill-flag - update below.
-  }
-
-  // If we hit the end of the block, check whether EFLAGS is live into a
-  // successor.
-  if (miI == BB->end()) {
-    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
-                                          sEnd = BB->succ_end();
-         sItr != sEnd; ++sItr) {
-      MachineBasicBlock* succ = *sItr;
-      if (succ->isLiveIn(X86::EFLAGS))
-        return false;
-    }
-  }
+  if (isEFLAGSLiveAfter(SelectItr, BB))
+    return false;
 
   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   // out. SelectMI should have a kill flag on EFLAGS.
@@ -34115,7 +34136,10 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
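(Editor's note: the PSHUFLW/PSHUFHW matching below, like getV4X86ShuffleImm earlier in this file, builds a 2-bit-per-lane immediate where result lane i reads source lane (Imm >> (2*i)) & 3. A sketch of that encoding, including the new single-element splat behavior -- mask {-1, 2, -1, -1} now encodes as 0xAA instead of the identity-defaulted 0xE8, which later broadcast matching can recognize. Names here are mine:)

```cpp
#include <cassert>
#include <cstdint>

// Assumes at least one defined (>= 0) element and, for the splat path, that
// every defined element is the same; the real code asserts both.
static uint8_t v4ShuffleImm(const int Mask[4], bool SplatSingleElt) {
  if (SplatSingleElt) {
    int FirstElt = -1;
    for (int i = 0; i != 4 && FirstElt < 0; ++i)
      FirstElt = Mask[i];
    return (uint8_t)((FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) |
                     FirstElt);
  }
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (uint8_t)((Mask[i] < 0 ? i : Mask[i]) << (2 * i)); // undef -> identity
  return Imm;
}

int main() {
  const int Mask[4] = {-1, 2, -1, -1};       // one non-undef element
  assert(v4ShuffleImm(Mask, false) == 0xE8); // old: identity-filled undefs
  assert(v4ShuffleImm(Mask, true) == 0xAA);  // new: fully splatted
}
```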
-  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
+  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
+      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+       (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
     SmallVector<int, 4> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
       ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
@@ -34185,6 +34209,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                                bool IsUnary) {
+  unsigned NumMaskElts = Mask.size();
   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
@@ -34242,6 +34267,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Attempt to match against an OR if we're performing a blend shuffle and the
+  // non-blended source element is zero in each case.
+  if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+      (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
+    bool IsBlend = true;
+    unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
+    unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
+    unsigned Scale1 = NumV1Elts / NumMaskElts;
+    unsigned Scale2 = NumV2Elts / NumMaskElts;
+    APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
+    APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef)
+        continue;
+      if (M == SM_SentinelZero) {
+        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+        continue;
+      }
+      if (M == (int)i) {
+        DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+        continue;
+      }
+      if (M == (int)(i + NumMaskElts)) {
+        DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+        continue;
+      }
+      IsBlend = false;
+      break;
+    }
+    if (IsBlend &&
+        DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+        DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+      Shuffle = ISD::OR;
+      SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT();
+      return true;
+    }
+  }
+
   return false;
 }
 
@@ -34430,6 +34495,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
          "Unexpected number of shuffle inputs!");
 
+  MVT RootVT = Root.getSimpleValueType();
+  unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned NumRootElts = RootVT.getVectorNumElements();
+
   // Find the inputs that enter the chain. Note that multiple uses are OK
   // here, we're not going to remove the operands we find.
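(Editor's note: a scalar model of the blend-to-OR match added above -- when every lane's non-selected source is provably zero, the blend and a plain OR produce the same vector, which is all the computeKnownBits checks assert:)

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Blend mask {0, 5, 2, 7}: lanes 0 and 2 from V1, lanes 1 and 3 from V2,
  // and each source is zero exactly where it is not selected.
  const uint32_t V1[4] = {11, 0, 33, 0};
  const uint32_t V2[4] = {0, 22, 0, 44};
  const uint32_t Blend[4] = {V1[0], V2[1], V1[2], V2[3]};
  for (int i = 0; i != 4; ++i)
    assert(Blend[i] == (V1[i] | V2[i])); // blend == or under the zero facts
}
```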
bool UnaryShuffle = (Inputs.size() == 1); @@ -34439,10 +34508,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); - MVT RootVT = Root.getSimpleValueType(); - assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && - VT2.getSizeInBits() == RootVT.getSizeInBits() && - "Vector size mismatch"); + assert(VT1.getSizeInBits() == RootSizeInBits && + VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; @@ -34454,8 +34521,6 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } bool OptForSize = DAG.shouldOptForSize(); - unsigned RootSizeInBits = RootVT.getSizeInBits(); - unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.isFloatingPoint() && Depth >= 1) || @@ -35346,6 +35411,9 @@ static SDValue combineX86ShufflesRecursively( assert(RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"); + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. @@ -35361,9 +35429,6 @@ static SDValue combineX86ShufflesRecursively( if (!VT.isVector()) return SDValue(); // Bail if we hit a non-vector. - assert(Root.getSimpleValueType().isVector() && - "Shuffles operate on vector types!"); - unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); assert(VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."); @@ -36113,15 +36178,16 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } // vbroadcast(vector load X) -> vbroadcast_load - if (SrcVT == MVT::v2f64 && Src.hasOneUse() && - ISD::isNormalLoad(Src.getNode())) { + if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 || + SrcVT == MVT::v4i32) && + Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { LoadSDNode *LN = cast(Src); // Unless the load is volatile or atomic. if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode( - X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(), LN->getPointerInfo(), LN->getOriginalAlign(), LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); @@ -38263,7 +38329,7 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, // a vector/float/double that got truncated/extended/bitcast to/from a scalar // integer. If so, replace the scalar ops with bool vector equivalents back down // the chain. -static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL, +static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -38316,6 +38382,10 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL, case ISD::SHL: { // If we find a suitable source, a SHL becomes a KSHIFTL. 
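(Editor's note on the SHL case below: an i8 holding a bitcast v8i1 mask shifts its lanes exactly like KSHIFTL shifts a k-register, which is why the scalar shift can be rewritten once a bool-vector source is found; the new subtarget guards reflect that v8i1 KSHIFTs need DQI and v32i1/v64i1 need BWI. A tiny scalar check:)

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t Mask = 0x05;                    // v8i1 with lanes 0 and 2 set
  const uint8_t Shifted = (uint8_t)(Mask << 3); // scalar SHL by 3
  assert(Shifted == 0x28);                      // lanes 3 and 5: KSHIFTL by 3
}
```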
SDValue Src0 = V.getOperand(0); + if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) || + ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI())) + break; + if (auto *Amt = dyn_cast(V.getOperand(1))) if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) return DAG.getNode( @@ -39039,10 +39109,12 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? - if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) { + if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 && + (SrcVT.getSizeInBits() % 128) == 0) { Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); - Src = DAG.getBitcast(SrcVT, Src); - return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); + MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits); + return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src), + Idx); } // Resolve the target shuffle inputs and mask. @@ -42707,31 +42779,46 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, // Turn it into series of XORs and a setnp. static SDValue combineParity(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); - - // We only support 64-bit and 32-bit. 64-bit requires special handling - // unless the 64-bit popcnt instruction is legal. - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) + // RHS needs to be 1. + if (!isOneConstant(N1)) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); + // Popcnt may be truncated. + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) + N0 = N0.getOperand(0); // LHS needs to be a single use CTPOP. if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse()) return SDValue(); - // RHS needs to be 1. - if (!isOneConstant(N1)) + EVT VT = N0.getValueType(); + + // We only support 64-bit and 32-bit. 64-bit requires special handling + // unless the 64-bit popcnt instruction is legal. + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) return SDValue(); SDLoc DL(N); SDValue X = N0.getOperand(0); + // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. + if (DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, + DAG.getConstant(0, DL, MVT::i8)); + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend or truncate to the original type. + return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); + } + // If this is 64-bit, its always best to xor the two 32-bit pieces together // even if we have popcnt. 
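(Editor's note: a scalar sketch of the parity lowering this combine builds -- my illustration, not the DAG code itself. XOR the 64-bit halves first, as the comment above says, keep folding down to a byte, and the final three folds come for free on x86 because TEST sets PF from the whole byte and SETNP reads it:)

```cpp
#include <cassert>
#include <cstdint>

static unsigned parity64(uint64_t X) {
  uint32_t P = (uint32_t)X ^ (uint32_t)(X >> 32); // xor the 32-bit pieces
  P ^= P >> 16;
  P ^= P >> 8;  // parity of X now equals parity of the low byte
  P ^= P >> 4;  // on x86 these last three folds are replaced by
  P ^= P >> 2;  // TEST %al, %al + SETNP, since PF covers the whole byte
  P ^= P >> 1;
  return P & 1;
}

int main() {
  assert(parity64(0x0000000100000001ULL) == 0); // two bits set -> even
  assert(parity64(0x8000000000000001ULL) == 0);
  assert(parity64(0x7ULL) == 1);                // three bits set -> odd
}
```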
if (VT == MVT::i64) { @@ -42745,7 +42832,7 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32, DAG.getNode(ISD::CTPOP, DL, MVT::i32, X), DAG.getConstant(1, DL, MVT::i32)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity); + return DAG.getZExtOrTrunc(Parity, DL, N->getValueType(0)); } assert(VT == MVT::i32 && "Unexpected VT!"); @@ -42766,8 +42853,8 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, // Copy the inverse of the parity flag into a register with setcc. SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Zero extend to original type. - return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); + // Extend or truncate to the original type. + return DAG.getZExtOrTrunc(Setnp, DL, N->getValueType(0)); } @@ -42935,7 +43022,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (VT == SrcVecVT.getScalarType() && N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && - llvm::all_of(EltBits, [](APInt M) { + llvm::all_of(EltBits, [](const APInt &M) { return M.isNullValue() || M.isAllOnesValue(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); @@ -44355,8 +44442,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - bool IsCommutative) { + const X86Subtarget &Subtarget, bool IsCommutative, + SmallVectorImpl &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -44459,6 +44546,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, if (!(A == C && B == D)) return false; + PostShuffleMask.clear(); + PostShuffleMask.append(NumElts, SM_SentinelUndef); + // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask @@ -44467,6 +44557,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; + unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44478,25 +44569,48 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; + // Check that successive odd/even elements are being operated on. If not, + // this is not a horizontal operation. + if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && + !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) + return false; + + // Compute the post-shuffle mask index based on where the element + // is stored in the HOP result, and where it needs to be moved to. + int Base = LIdx & ~1u; + int Index = ((Base % NumEltsPer128BitChunk) / 2) + + ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); + // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. 
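(Editor's note: the lane semantics the new PostShuffleMask is indexed against, written out for the 128-bit FP case -- an illustration, not patch content. HADDPS puts LHS pair-sums in the low half and RHS pair-sums in the high half, so a mismatched input order is now repaired by shuffling this canonical result instead of rejecting the match:)

```cpp
#include <cassert>

static void haddps(const float L[4], const float R[4], float Out[4]) {
  Out[0] = L[0] + L[1]; // low half: adjacent pairs of LHS
  Out[1] = L[2] + L[3];
  Out[2] = R[0] + R[1]; // high half: adjacent pairs of RHS
  Out[3] = R[2] + R[3];
}

int main() {
  const float A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
  float Out[4];
  haddps(A, B, Out);
  assert(Out[0] == 3 && Out[1] == 7 && Out[2] == 30 && Out[3] == 70);
}
```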
- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) + Index += NumEltsPer64BitChunk; + PostShuffleMask[i + j] = Index; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. - if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split), unless + // the shuffle can widen to shuffle entire lanes, which should still be quick. + if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() && + isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), + PostShuffleMask) && + !canScaleShuffleElements(PostShuffleMask, 2)) + return false; + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44515,10 +44629,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. @@ -47620,17 +47740,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, bool IsAdd = N->getOpcode() == ISD::ADD; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, - DL, Ops[0].getValueType(), Ops); + return DAG.getNode(IsAdd ? 
X86ISD::HADD : X86ISD::HSUB, DL, + Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HOpBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } return SDValue(); @@ -48015,6 +48140,25 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getOperand(1)); } break; + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case X86ISD::ANDNP: + // TODO: Add 256-bit support. + if (!IsSplat && VT.is512BitVector()) { + SmallVector LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); + } + break; case X86ISD::PACKSS: case X86ISD::PACKUS: if (!IsSplat && NumOps == 2 && VT.is256BitVector() && @@ -48277,12 +48421,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned IdxVal = N->getConstantOperandVal(1); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); + unsigned SizeInBits = VT.getSizeInBits(); + unsigned InSizeInBits = InVecVT.getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) && - InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { - auto isConcatenatedNot = [] (SDValue V) { + InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { + auto isConcatenatedNot = [](SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; @@ -48325,7 +48471,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && - InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) { + InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDLoc DL(N); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), @@ -48336,14 +48482,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // broadcasting to the smaller type directly, assuming this is the only use. // As it's a broadcast we don't care about the extraction index.
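A tiny sketch of the invariant the comment above relies on: every aligned subvector of a broadcast (splat) is the same narrower broadcast, so the extraction index is irrelevant (plain C++ stand-in for the DAG nodes):

// Sketch: extract_subvector(broadcast_wide(x), any aligned idx) ==
// broadcast_narrow(x). Vectors of int stand in for vector SDValues.
#include <cassert>
#include <vector>

static std::vector<int> broadcast(int X, int NumElts) {
  return std::vector<int>(NumElts, X);
}

int main() {
  auto Wide = broadcast(42, 8);  // e.g. a 256-bit broadcast of a scalar
  for (int Idx : {0, 4}) {       // either aligned 128-bit subvector
    std::vector<int> Sub(Wide.begin() + Idx, Wide.begin() + Idx + 4);
    assert(Sub == broadcast(42, 4));  // == the narrow broadcast directly
  }
  return 0;
}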
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && - InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits() + InVec.getOperand(0).getValueSizeInBits() <= SizeInBits) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { auto *MemIntr = cast(InVec); - if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + if (MemIntr->getMemoryVT().getSizeInBits() <= SizeInBits) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, MemIntr->getMemoryVT(), @@ -48358,7 +48504,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // SimplifyDemandedVectorElts do more simplifications. if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || InVec.getOpcode() == X86ISD::VBROADCAST_LOAD)) - return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits()); + return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // If we're extracting a broadcasted subvector, just use the source. if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST && @@ -48366,12 +48512,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return InVec.getOperand(0); // Attempt to extract from the source of a shuffle vector. - if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 && + if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % VT.getVectorNumElements()) == 0) { SmallVector ShuffleMask; SmallVector ScaledMask; SmallVector ShuffleInputs; - unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits(); + unsigned NumSubVecs = InSizeInBits / SizeInBits; // Decode the shuffle mask and scale it so it's shuffling subvectors.
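The decode-and-scale step mentioned in that comment only succeeds when each subvector-sized group of mask entries forms an aligned, contiguous run. A simplified sketch in the spirit of scaleShuffleElements (scaleMask is an invented helper; the real code also handles zero/undef sentinels per element):

// Sketch of mask coarsening: collapse an element-level shuffle mask to one
// index per subvector, or fail. -1 marks an undef group (simplified).
#include <cassert>
#include <vector>

static bool scaleMask(const std::vector<int> &Mask, int NumSubVecs,
                      std::vector<int> &Scaled) {
  int NumElts = (int)Mask.size();
  if (NumElts % NumSubVecs != 0)
    return false;
  int Scale = NumElts / NumSubVecs;
  Scaled.assign(NumSubVecs, -1);
  for (int Sub = 0; Sub < NumSubVecs; ++Sub) {
    int First = Mask[Sub * Scale];
    if (First < 0)
      continue; // treat the whole group as undef; simplified here
    if (First % Scale != 0)
      return false; // not aligned to a subvector boundary
    for (int I = 0; I < Scale; ++I)
      if (Mask[Sub * Scale + I] != First + I)
        return false; // not a contiguous run
    Scaled[Sub] = First / Scale;
  }
  return true;
}

int main() {
  std::vector<int> Scaled;
  assert(scaleMask({4, 5, 6, 7, 0, 1, 2, 3}, 2, Scaled) &&
         Scaled[0] == 1 && Scaled[1] == 0);        // swaps the two subvectors
  assert(!scaleMask({1, 2, 3, 4, 0, 1, 2, 3}, 2, Scaled)); // misaligned run
  return 0;
}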
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { @@ -48381,11 +48527,11 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (ScaledMask[SubVecIdx] == SM_SentinelZero) return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; - if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) { + if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, - SDLoc(N), VT.getSizeInBits()); + SDLoc(N), SizeInBits); } } } @@ -48417,10 +48563,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - VT.is128BitVector() && - InVec.getOperand(0).getSimpleValueType().is128BitVector()) { + (SizeInBits == 128 || SizeInBits == 256) && + InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { + SDLoc DL(N); + SDValue Ext = InVec.getOperand(0); + if (Ext.getValueSizeInBits() > SizeInBits) + Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); - return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); + return DAG.getNode(ExtOp, DL, VT, Ext); } if (InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && @@ -48432,6 +48582,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } + if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue InVecSrc = InVec.getOperand(0); + unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; + SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext); + } } return SDValue(); @@ -48548,6 +48706,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); + unsigned Opcode = N->getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Try to merge vector loads and extend_inreg to an extload. @@ -48556,7 +48715,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, auto *Ld = cast(In); if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); - ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG + ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = @@ -48564,8 +48723,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, - Ld->getOriginalAlign(), + Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; @@ -48574,8 +48732,9 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } // Attempt to combine as a shuffle. 
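The extload fold at the top of combineExtInVec above merges a simple vector load feeding a sign/zero extend into a single extending load. A scalar sketch of why the two forms agree (plain C++, illustrative only):

// Sketch: loading narrow data and sign-extending afterwards equals one
// sign-extending load that widens each element as it is read.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  int16_t Mem[4] = {-1, 2, -3, 4}; // narrow vector in memory

  // Form 1: plain load, then a separate sign_extend of every element.
  int16_t Narrow[4];
  std::memcpy(Narrow, Mem, sizeof(Narrow));
  int32_t Extended[4];
  for (int I = 0; I < 4; ++I)
    Extended[I] = (int32_t)Narrow[I];

  // Form 2: a single "SEXTLOAD" that extends at load time.
  for (int I = 0; I < 4; ++I)
    assert(Extended[I] == (int32_t)Mem[I]);
  return 0;
}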
- // TODO: SSE41 support - if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { + // TODO: SSE ZERO_EXTEND_VECTOR_INREG support. + if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG || + (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasAVX())) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) @@ -49482,8 +49641,8 @@ LowerXConstraint(EVT ConstraintVT) const { // Lower @cc targets via setcc. SDValue X86TargetLowering::LowerAsmOutputForConstraint( - SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo, - SelectionDAG &DAG) const { + SDValue &Chain, SDValue &Flag, const SDLoc &DL, + const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); if (Cond == X86::COND_INVALID) return SDValue(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 7f3dc90a2d735..55611b6bfc9f8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1116,7 +1116,8 @@ namespace llvm { } /// Handle Lowering flag assembly outputs. - SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL, + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, + const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override; @@ -1349,8 +1350,6 @@ namespace llvm { Align Alignment, SelectionDAG &DAG) const; - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; - /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 91341f93a9943..3632305aa5cbf 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -202,7 +202,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, auto Vec = II.getArgOperand(0); auto Amt = II.getArgOperand(1); - auto VT = cast(Vec->getType()); + auto VT = cast(Vec->getType()); auto SVT = VT->getElementType(); auto AmtVT = Amt->getType(); unsigned VWidth = VT->getNumElements(); @@ -234,7 +234,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && cast(AmtVT)->getElementType() == SVT && "Unexpected shift-by-scalar type"); - unsigned NumAmtElts = cast(AmtVT)->getNumElements(); + unsigned NumAmtElts = cast(AmtVT)->getNumElements(); APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); KnownBits KnownLowerBits = llvm::computeKnownBits( @@ -350,7 +350,7 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, auto Vec = II.getArgOperand(0); auto Amt = II.getArgOperand(1); - auto VT = cast(II.getType()); + auto VT = cast(II.getType()); auto SVT = VT->getElementType(); int NumElts = VT->getNumElements(); int BitWidth = SVT->getIntegerBitWidth(); @@ -377,7 +377,7 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, SmallVector ShiftAmts; for (int I = 0; I < NumElts; ++I) { auto *CElt = CShift->getAggregateElement(I); - if (CElt && isa(CElt)) { + if (isa_and_nonnull(CElt)) { ShiftAmts.push_back(-1); continue; } @@ -448,10 +448,10 @@ static Value *simplifyX86pack(IntrinsicInst &II, if (isa(Arg0) && isa(Arg1)) return 
UndefValue::get(ResTy); - auto *ArgTy = cast(Arg0->getType()); + auto *ArgTy = cast(Arg0->getType()); unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; unsigned NumSrcElts = ArgTy->getNumElements(); - assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && + assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && "Unexpected packing types"); unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; @@ -513,7 +513,7 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, if (isa(Arg)) return Constant::getNullValue(ResTy); - auto *ArgTy = dyn_cast(Arg->getType()); + auto *ArgTy = dyn_cast(Arg->getType()); // We can't easily peek through x86_mmx types. if (!ArgTy) return nullptr; @@ -567,7 +567,7 @@ static Value *simplifyX86insertps(const IntrinsicInst &II, if (!CInt) return nullptr; - VectorType *VecTy = cast(II.getType()); + auto *VecTy = cast(II.getType()); assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); // The immediate permute control byte looks like this: @@ -810,7 +810,7 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II, if (!V) return nullptr; - auto *VecTy = cast(II.getType()); + auto *VecTy = cast(II.getType()); unsigned NumElts = VecTy->getNumElements(); assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && "Unexpected number of elements in shuffle mask!"); @@ -855,7 +855,7 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II, if (!V) return nullptr; - auto *VecTy = cast(II.getType()); + auto *VecTy = cast(II.getType()); unsigned NumElts = VecTy->getNumElements(); bool IsPD = VecTy->getScalarType()->isDoubleTy(); unsigned NumLaneElts = IsPD ? 2 : 4; @@ -903,7 +903,7 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II, if (!V) return nullptr; - auto *VecTy = cast(II.getType()); + auto *VecTy = cast(II.getType()); unsigned Size = VecTy->getNumElements(); assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && "Unexpected shuffle mask size"); @@ -1084,7 +1084,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. 
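A small illustration of the demanded-elements idea in that comment: if the consumer reads only lane 0, the other lanes of the input can be replaced without affecting the result (plain C++; scalarOnlyOp is an invented stand-in for a *_ss/*_sd style intrinsic):

// Sketch: a lane-0-only consumer makes the producers of the upper lanes
// dead, so they can be simplified (here: zeroed) with no visible change.
#include <cassert>

static float scalarOnlyOp(const float (&V)[4]) {
  return V[0] * 2.0f; // reads lane 0 only, like the scalar SSE intrinsics
}

int main() {
  float Full[4] = {1.5f, 99.0f, -7.0f, 42.0f};
  float Simplified[4] = {1.5f, 0.0f, 0.0f, 0.0f}; // upper lanes replaced
  assert(scalarOnlyOp(Full) == scalarOnlyOp(Simplified));
  return 0;
}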
Value *Arg = II.getArgOperand(0); - unsigned VWidth = cast(Arg->getType())->getNumElements(); + unsigned VWidth = cast(Arg->getType())->getNumElements(); if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { return IC.replaceOperand(II, 0, V); } @@ -1136,7 +1136,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { bool MadeChange = false; Value *Arg0 = II.getArgOperand(0); Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); + unsigned VWidth = cast(Arg0->getType())->getNumElements(); if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { IC.replaceOperand(II, 0, V); MadeChange = true; @@ -1362,7 +1362,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Arg1 = II.getArgOperand(1); assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && "Unexpected packed shift size"); - unsigned VWidth = cast(Arg1->getType())->getNumElements(); + unsigned VWidth = cast(Arg1->getType())->getNumElements(); if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { return IC.replaceOperand(II, 1, V); @@ -1433,7 +1433,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { bool MadeChange = false; Value *Arg0 = II.getArgOperand(0); Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); + unsigned VWidth = + cast(Arg0->getType())->getNumElements(); APInt UndefElts1(VWidth, 0); APInt DemandedElts1 = @@ -1476,8 +1477,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_sse4a_extrq: { Value *Op0 = II.getArgOperand(0); Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && VWidth1 == 16 && "Unexpected operand sizes"); @@ -1517,7 +1518,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining // bits of the lower 64-bits. The upper 64-bits are undefined. Value *Op0 = II.getArgOperand(0); - unsigned VWidth = cast(Op0->getType())->getNumElements(); + unsigned VWidth = cast(Op0->getType())->getNumElements(); assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && "Unexpected operand size"); @@ -1541,10 +1542,10 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_sse4a_insertq: { Value *Op0 = II.getArgOperand(0); Value *Op1 = II.getArgOperand(1); - unsigned VWidth = cast(Op0->getType())->getNumElements(); + unsigned VWidth = cast(Op0->getType())->getNumElements(); assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - cast(Op1->getType())->getNumElements() == 2 && + cast(Op1->getType())->getNumElements() == 2 && "Unexpected operand size"); // See if we're dealing with constant values. @@ -1577,8 +1578,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // undefined. 
Value *Op0 = II.getArgOperand(0); Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && VWidth1 == 2 && "Unexpected operand sizes"); @@ -1650,9 +1651,9 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { "Not expecting mask and operands with different sizes"); unsigned NumMaskElts = - cast(Mask->getType())->getNumElements(); + cast(Mask->getType())->getNumElements(); unsigned NumOperandElts = - cast(II.getType())->getNumElements(); + cast(II.getType())->getNumElements(); if (NumMaskElts == NumOperandElts) { return SelectInst::Create(BoolVec, Op1, Op0); } @@ -1768,7 +1769,7 @@ Optional X86TTIImpl::simplifyDemandedUseBitsIntrinsic( ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. } else { auto Arg = II.getArgOperand(0); - auto ArgType = cast(Arg->getType()); + auto ArgType = cast(Arg->getType()); ArgWidth = ArgType->getNumElements(); } @@ -1785,10 +1786,6 @@ Optional X86TTIImpl::simplifyDemandedUseBitsIntrinsic( KnownBitsComputed = true; break; } - case Intrinsic::x86_sse42_crc32_64_64: - Known.Zero.setBitsFrom(32); - KnownBitsComputed = true; - break; } return None; } @@ -1938,7 +1935,7 @@ Optional X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( case Intrinsic::x86_avx512_packusdw_512: case Intrinsic::x86_avx512_packuswb_512: { auto *Ty0 = II.getArgOperand(0)->getType(); - unsigned InnerVWidth = cast(Ty0)->getNumElements(); + unsigned InnerVWidth = cast(Ty0)->getNumElements(); assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index f7f22285bd15d..e83e1e74ff526 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1182,6 +1182,15 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, X86sub_flag, sub, 0, 1, 0>; } +// Version of XOR8rr that uses GR8_NOREX. This is used by the handling of +// __builtin_parity where the last step xors an h-register with an l-register. +let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst", + Defs = [EFLAGS], isCommutable = 1 in +def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), + (ins GR8_NOREX:$src1, GR8_NOREX:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", []>, + Sched<[WriteALU]>; + // Arithmetic. defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, 1, 0>; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 4df93fb2ed60b..d78d9f7c80c76 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1698,6 +1698,16 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>, Requires<[In64BitMode]>; +// Special pattern to catch the last step of __builtin_parity handling. Our +// goal is to use an xor of an h-register with the corresponding l-register. +// The above patterns would handle this on non-64-bit targets, but for 64-bit
We're using a NOREX instruction here in case + // register allocation fails to keep the two registers together. So we need to + // make sure we can't accidentally mix R8-R15 with an h-register. +def : Pat<(X86xor_flag (i8 (trunc GR32:$src)), + (i8 (trunc (srl_su GR32:$src, (i8 8))))), + (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit), + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; // (shl x, 1) ==> (add x, x) // Note that if x is undef (immediate or otherwise), we could theoretically diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 42c1111735701..c753880fc92cd 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -28,9 +28,9 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -6675,6 +6675,18 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } +bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + + // ENDBR instructions should not be scheduled around. + unsigned Opcode = MI.getOpcode(); + if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32) + return true; + + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} + bool X86InstrInfo:: reverseBranchCondition(SmallVectorImpl &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 89f2ff118c378..c345a8217168d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -409,6 +409,13 @@ class X86InstrInfo final : public X86GenInstrInfo { bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; + /// isSchedulingBoundary - Overrides the isSchedulingBoundary from + /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR + /// instructions and prevent them from being re-scheduled. + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads /// should be scheduled together.
On some targets if two loads are loading from diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp index 84f560f2f9ee7..96c9a8580f8f4 100644 --- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -70,6 +70,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoAVX512DQ(); setLegalizerInfoAVX512BW(); + getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) + .scalarize(0) + .minScalar(0, LLT::scalar(32)) + .libcall(); + setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1); for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR}) setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1); diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 50f8b3477acce..18fcc48bc9cdd 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -42,6 +42,7 @@ #include "X86TargetMachine.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -104,9 +105,9 @@ static cl::opt EmitDotVerify( cl::init(false), cl::Hidden); static llvm::sys::DynamicLibrary OptimizeDL; -typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size, - unsigned int *edges, int *edge_values, - int *cut_edges /* out */, unsigned int edges_size); +typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize, + unsigned int *Edges, int *EdgeValues, + int *CutEdges /* out */, unsigned int EdgesSize); static OptimizeCutT OptimizeCut = nullptr; namespace { @@ -148,9 +149,10 @@ class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass { private: using GraphBuilder = ImmutableGraphBuilder; + using Edge = MachineGadgetGraph::Edge; + using Node = MachineGadgetGraph::Node; using EdgeSet = MachineGadgetGraph::EdgeSet; using NodeSet = MachineGadgetGraph::NodeSet; - using Gadget = std::pair; const X86Subtarget *STI; const TargetInstrInfo *TII; @@ -162,8 +164,8 @@ class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass { const MachineDominanceFrontier &MDF) const; int hardenLoadsWithPlugin(MachineFunction &MF, std::unique_ptr Graph) const; - int hardenLoadsWithGreedyHeuristic( - MachineFunction &MF, std::unique_ptr Graph) const; + int hardenLoadsWithHeuristic(MachineFunction &MF, + std::unique_ptr Graph) const; int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */, NodeSet &ElimNodes /* in, out */) const; @@ -198,7 +200,7 @@ struct DOTGraphTraits : DefaultDOTGraphTraits { using ChildIteratorType = typename Traits::ChildIteratorType; using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; - DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} std::string getNodeLabel(NodeRef Node, GraphType *) { if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel) @@ -243,7 +245,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( AU.setPreservesCFG(); } -static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF, +static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF, MachineGadgetGraph *G) { WriteGraph(OS, G, /*ShortNames*/ false, "Speculative gadgets for \"" + MF.getName() + "\" function"); @@ -279,7 +281,7 @@ 
bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( return false; // didn't find any gadgets if (EmitDotVerify) { - WriteGadgetGraph(outs(), MF, Graph.get()); + writeGadgetGraph(outs(), MF, Graph.get()); return false; } @@ -292,7 +294,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( raw_fd_ostream FileOut(FileName, FileError); if (FileError) errs() << FileError.message(); - WriteGadgetGraph(FileOut, MF, Graph.get()); + writeGadgetGraph(FileOut, MF, Graph.get()); FileOut.close(); LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n"); if (EmitDotOnly) @@ -313,7 +315,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( } FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph)); } else { // Use the default greedy heuristic - FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph)); + FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph)); } if (FencesInserted > 0) @@ -540,17 +542,17 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( // Returns the number of remaining gadget edges that could not be eliminated int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( - MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */, - MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const { + MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */, + NodeSet &ElimNodes /* in, out */) const { if (G.NumFences > 0) { // Eliminate fences and CFG edges that ingress and egress the fence, as // they are trivially mitigated. - for (const auto &E : G.edges()) { - const MachineGadgetGraph::Node *Dest = E.getDest(); + for (const Edge &E : G.edges()) { + const Node *Dest = E.getDest(); if (isFence(Dest->getValue())) { ElimNodes.insert(*Dest); ElimEdges.insert(E); - for (const auto &DE : Dest->edges()) + for (const Edge &DE : Dest->edges()) ElimEdges.insert(DE); } } @@ -558,29 +560,28 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( // Find and eliminate gadget edges that have been mitigated. 
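The mitigation test that follows boils down to graph reachability: a gadget (source, sink) pair counts as mitigated once the sink is unreachable from the source over the remaining CFG edges. A minimal sketch with a plain adjacency list (not the ImmutableGraph classes):

// Sketch: a gadget Src -> Sink is mitigated once Sink is no longer reachable
// from Src over the un-eliminated CFG edges. Iterative DFS over int nodes.
#include <cassert>
#include <vector>

static bool reachable(const std::vector<std::vector<int>> &Adj, int From,
                      int To) {
  std::vector<bool> Seen(Adj.size(), false);
  std::vector<int> Stack{From};
  while (!Stack.empty()) {
    int N = Stack.back();
    Stack.pop_back();
    if (N == To)
      return true;
    if (Seen[N])
      continue;
    Seen[N] = true;
    for (int Succ : Adj[N])
      Stack.push_back(Succ);
  }
  return false;
}

int main() {
  // 0 -> 1 -> 2; eliminating 1->2 (e.g. an LFENCE sits there) breaks 0 ~> 2.
  std::vector<std::vector<int>> Adj{{1}, {2}, {}};
  assert(reachable(Adj, 0, 2));
  Adj[1].clear(); // edge eliminated
  assert(!reachable(Adj, 0, 2));
  return 0;
}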
int MitigatedGadgets = 0, RemainingGadgets = 0; - MachineGadgetGraph::NodeSet ReachableNodes{G}; - for (const auto &RootN : G.nodes()) { + NodeSet ReachableNodes{G}; + for (const Node &RootN : G.nodes()) { if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) continue; // skip this node if it isn't a gadget source // Find all of the nodes that are CFG-reachable from RootN using DFS ReachableNodes.clear(); - std::function - FindReachableNodes = - [&](const MachineGadgetGraph::Node *N, bool FirstNode) { - if (!FirstNode) - ReachableNodes.insert(*N); - for (const auto &E : N->edges()) { - const MachineGadgetGraph::Node *Dest = E.getDest(); - if (MachineGadgetGraph::isCFGEdge(E) && - !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest)) - FindReachableNodes(Dest, false); - } - }; + std::function FindReachableNodes = + [&](const Node *N, bool FirstNode) { + if (!FirstNode) + ReachableNodes.insert(*N); + for (const Edge &E : N->edges()) { + const Node *Dest = E.getDest(); + if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) && + !ReachableNodes.contains(*Dest)) + FindReachableNodes(Dest, false); + } + }; FindReachableNodes(&RootN, true); // Any gadget whose sink is unreachable has been mitigated - for (const auto &E : RootN.edges()) { + for (const Edge &E : RootN.edges()) { if (MachineGadgetGraph::isGadgetEdge(E)) { if (ReachableNodes.contains(*E.getDest())) { // This gadget's sink is reachable @@ -598,8 +599,8 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( std::unique_ptr X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges( std::unique_ptr Graph) const { - MachineGadgetGraph::NodeSet ElimNodes{*Graph}; - MachineGadgetGraph::EdgeSet ElimEdges{*Graph}; + NodeSet ElimNodes{*Graph}; + EdgeSet ElimEdges{*Graph}; int RemainingGadgets = elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes); if (ElimEdges.empty() && ElimNodes.empty()) { @@ -630,11 +631,11 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( auto Edges = std::make_unique(Graph->edges_size()); auto EdgeCuts = std::make_unique(Graph->edges_size()); auto EdgeValues = std::make_unique(Graph->edges_size()); - for (const auto &N : Graph->nodes()) { + for (const Node &N : Graph->nodes()) { Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin()); } Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node - for (const auto &E : Graph->edges()) { + for (const Edge &E : Graph->edges()) { Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest()); EdgeValues[Graph->getEdgeIndex(E)] = E.getValue(); } @@ -651,74 +652,67 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); - Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph}, - CutEdges); + Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges); } while (true); return FencesInserted; } -int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic( +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic( MachineFunction &MF, std::unique_ptr Graph) const { - LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); - Graph = trimMitigatedEdges(std::move(Graph)); - LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); + // If `MF` does not have any fences, then no gadgets would have been + // mitigated at this point. 
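For reference, the core of the rewritten heuristic just below: each gadget edge is mitigated by cutting either every egress CFG edge of its source or every ingress CFG edge of its sink, whichever not-yet-cut set is cheaper. A minimal standalone sketch of that cheaper-side choice (plain C++, with an invented CfgEdge type rather than the real graph classes):

// Sketch: pick the cheaper of the two edge sets that would mitigate one
// gadget, counting only edges that are not already cut.
#include <cassert>
#include <set>
#include <vector>

struct CfgEdge { int Id; int Cost; };

static std::vector<CfgEdge> &pickSideToCut(std::vector<CfgEdge> &Egress,
                                           std::vector<CfgEdge> &Ingress,
                                           const std::set<int> &AlreadyCut) {
  int EgressCost = 0, IngressCost = 0;
  for (const CfgEdge &E : Egress)
    if (!AlreadyCut.count(E.Id))
      EgressCost += E.Cost;
  for (const CfgEdge &E : Ingress)
    if (!AlreadyCut.count(E.Id))
      IngressCost += E.Cost;
  return IngressCost < EgressCost ? Ingress : Egress;
}

int main() {
  std::vector<CfgEdge> Egress{{0, 5}, {1, 5}};  // total cost 10
  std::vector<CfgEdge> Ingress{{2, 4}, {3, 4}}; // total cost 8 -> cheaper
  std::set<int> Cut;
  assert(&pickSideToCut(Egress, Ingress, Cut) == &Ingress);
  Cut.insert(2); // an ingress edge already cut makes that side cheaper still
  assert(&pickSideToCut(Egress, Ingress, Cut) == &Ingress);
  return 0;
}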
+ if (Graph->NumFences > 0) { + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); + Graph = trimMitigatedEdges(std::move(Graph)); + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); + } + if (Graph->NumGadgets == 0) return 0; LLVM_DEBUG(dbgs() << "Cutting edges...\n"); - MachineGadgetGraph::NodeSet ElimNodes{*Graph}, GadgetSinks{*Graph}; - MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph}; - auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) { - return !ElimEdges.contains(E) && !CutEdges.contains(E) && - MachineGadgetGraph::isCFGEdge(E); - }; - auto IsGadgetEdge = [&ElimEdges, - &CutEdges](const MachineGadgetGraph::Edge &E) { - return !ElimEdges.contains(E) && !CutEdges.contains(E) && - MachineGadgetGraph::isGadgetEdge(E); - }; - - // FIXME: this is O(E^2), we could probably do better. - do { - // Find the cheapest CFG edge that will eliminate a gadget (by being - // egress from a SOURCE node or ingress to a SINK node), and cut it. - const MachineGadgetGraph::Edge *CheapestSoFar = nullptr; - - // First, collect all gadget source and sink nodes. - MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph}; - for (const auto &N : Graph->nodes()) { - if (ElimNodes.contains(N)) + EdgeSet CutEdges{*Graph}; + + // Begin by collecting all ingress CFG edges for each node + DenseMap> IngressEdgeMap; + for (const Edge &E : Graph->edges()) + if (MachineGadgetGraph::isCFGEdge(E)) + IngressEdgeMap[E.getDest()].push_back(&E); + + // For each gadget edge, make cuts that guarantee the gadget will be + // mitigated. A computationally efficient way to achieve this is to either: + // (a) cut all egress CFG edges from the gadget source, or + // (b) cut all ingress CFG edges to the gadget sink. + // + // Moreover, the algorithm tries not to make a cut into a loop by preferring + // to make a (b)-type cut if the gadget source resides at a greater loop depth + // than the gadget sink, or an (a)-type cut otherwise. + for (const Node &N : Graph->nodes()) { + for (const Edge &E : N.edges()) { + if (!MachineGadgetGraph::isGadgetEdge(E)) continue; - for (const auto &E : N.edges()) { - if (IsGadgetEdge(E)) { - GadgetSources.insert(N); - GadgetSinks.insert(*E.getDest()); - } - } - } - // Next, look for the cheapest CFG edge which, when cut, is guaranteed to - // mitigate at least one gadget by either: - // (a) being egress from a gadget source, or - // (b) being ingress to a gadget sink. - for (const auto &N : Graph->nodes()) { - if (ElimNodes.contains(N)) - continue; - for (const auto &E : N.edges()) { - if (IsCFGEdge(E)) { - if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) { - if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue()) - CheapestSoFar = &E; - } - } - } + SmallVector EgressEdges; + SmallVector &IngressEdges = IngressEdgeMap[E.getDest()]; + for (const Edge &EgressEdge : N.edges()) + if (MachineGadgetGraph::isCFGEdge(EgressEdge)) + EgressEdges.push_back(&EgressEdge); + + int EgressCutCost = 0, IngressCutCost = 0; + for (const Edge *EgressEdge : EgressEdges) + if (!CutEdges.contains(*EgressEdge)) + EgressCutCost += EgressEdge->getValue(); + for (const Edge *IngressEdge : IngressEdges) + if (!CutEdges.contains(*IngressEdge)) + IngressCutCost += IngressEdge->getValue(); + + auto &EdgesToCut = + IngressCutCost < EgressCutCost ? 
IngressEdges : EgressEdges; + for (const Edge *E : EdgesToCut) + CutEdges.insert(*E); } - - assert(CheapestSoFar && "Failed to cut an edge"); - CutEdges.insert(*CheapestSoFar); - ElimEdges.insert(*CheapestSoFar); - } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes)); + } LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); @@ -734,8 +728,8 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences( MachineFunction &MF, MachineGadgetGraph &G, EdgeSet &CutEdges /* in, out */) const { int FencesInserted = 0; - for (const auto &N : G.nodes()) { - for (const auto &E : N.edges()) { + for (const Node &N : G.nodes()) { + for (const Edge &E : N.edges()) { if (CutEdges.contains(E)) { MachineInstr *MI = N.getValue(), *Prev; MachineBasicBlock *MBB; // Insert an LFENCE in this MBB @@ -751,7 +745,7 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences( Prev = MI->getPrevNode(); // Remove all egress CFG edges from this branch because the inserted // LFENCE prevents gadgets from crossing the branch. - for (const auto &E : N.edges()) { + for (const Edge &E : N.edges()) { if (MachineGadgetGraph::isCFGEdge(E)) CutEdges.insert(E); } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 9ce2a4637e2ea..af86c1fc7206d 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1079,29 +1079,30 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, } } -/// Return the longest nop which can be efficiently decoded for the given -/// target cpu. 15-bytes is the longest single NOP instruction, but some -/// platforms can't decode the longest forms efficiently. -static unsigned maxLongNopLength(const X86Subtarget *Subtarget) { - if (Subtarget->getFeatureBits()[X86::ProcIntelSLM]) - return 7; - if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP]) - return 15; - if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP]) - return 11; - if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit()) - return 10; - if (Subtarget->is32Bit()) - return 2; - return 1; -} - /// Emit the largest nop instruction smaller than or equal to \p NumBytes /// bytes. Return the size of nop emitted. static unsigned emitNop(MCStreamer &OS, unsigned NumBytes, const X86Subtarget *Subtarget) { + // Determine the longest nop which can be efficiently decoded for the given + // target cpu. 15-bytes is the longest single NOP instruction, but some + // platforms can't decode the longest forms efficiently. + unsigned MaxNopLength = 1; + if (Subtarget->is64Bit()) { + // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the + // IndexReg/BaseReg below need to be updated. 
+ if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP)) + MaxNopLength = 7; + else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP)) + MaxNopLength = 15; + else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP)) + MaxNopLength = 11; + else + MaxNopLength = 10; + } if (Subtarget->is32Bit()) + MaxNopLength = 2; + // Cap a single nop emission at the profitable value for the target - NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget)); + NumBytes = std::min(NumBytes, MaxNopLength); unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index ff2a94bcac8e2..095aea3a10636 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -231,15 +231,11 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPU.empty()) CPU = "generic"; - std::string FullFS = std::string(FS); - if (In64BitMode) { - // SSE2 should default to enabled in 64-bit mode, but can be turned off - // explicitly. - if (!FullFS.empty()) - FullFS = "+sse2," + FullFS; - else - FullFS = "+sse2"; - } + std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); + assert(!FullFS.empty() && "Failed to parse X86 triple"); + + if (!FS.empty()) + FullFS = (Twine(FullFS) + "," + FS).str(); // Parse features string and set the CPU. ParseSubtargetFeatures(CPU, FullFS); @@ -251,17 +247,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (hasSSE42() || hasSSE4A()) IsUAMem16Slow = false; - // It's important to keep the MCSubtargetInfo feature bits in sync with - // target data structure which is shared with MC code emitter, etc. - if (In64BitMode) - ToggleFeature(X86::Mode64Bit); - else if (In32BitMode) - ToggleFeature(X86::Mode32Bit); - else if (In16BitMode) - ToggleFeature(X86::Mode16Bit); - else - llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!"); - LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); @@ -277,17 +262,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { isTargetKFreeBSD() || In64BitMode) stackAlignment = Align(16); - // Some CPUs have more overhead for gather. The specified overhead is relative - // to the Load operation. "2" is the number provided by Intel architects. This - // parameter is used for cost estimation of Gather Op and comparison with - // other alternatives. - // TODO: Remove the explicit hasAVX512()?, That would mean we would only - // enable gather with a -march. - if (hasAVX512() || (hasAVX2() && hasFastGather())) - GatherOverhead = 2; - if (hasAVX512()) - ScatterOverhead = 2; - // Consume the vector width attribute or apply any target specific limit. 
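A standalone sketch of the nop-length policy that emitNop now computes inline above (the booleans stand in for the Subtarget feature queries; illustrative only):

// Sketch: pick the longest NOP the target decodes cheaply, then clamp the
// requested padding to it, mirroring the inlined emitNop logic.
#include <algorithm>
#include <cassert>

struct NopFeatures {
  bool Is64Bit, Is32Bit, Fast7, Fast15, Fast11;
};

static unsigned maxNopLength(const NopFeatures &F) {
  unsigned Max = 1;
  if (F.Is64Bit)
    Max = F.Fast7 ? 7 : F.Fast15 ? 15 : F.Fast11 ? 11 : 10;
  if (F.Is32Bit)
    Max = 2;
  return Max;
}

int main() {
  assert(maxNopLength({true, false, false, true, false}) == 15);
  assert(maxNopLength({false, true, false, false, false}) == 2);
  // Requesting 12 bytes of padding on a fast-7-byte-NOP CPU emits 7 first.
  unsigned NumBytes =
      std::min(12u, maxNopLength({true, false, true, false, false}));
  assert(NumBytes == 7);
  return 0;
}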
if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; @@ -312,11 +286,6 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), - In64BitMode(TargetTriple.getArch() == Triple::x86_64), - In32BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() != Triple::CODE16), - In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 6f6823dea8ffd..c93fa082eeb60 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -468,9 +468,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Indicates target prefers AVX512 mask registers. bool PreferMaskRegisters = false; - /// Threeway branch is profitable in this subtarget. - bool ThreewayBranchProfitable = false; - /// Use Goldmont specific floating point div/sqrt costs. bool UseGLMDivSqrtCosts = false; @@ -498,17 +495,13 @@ class X86Subtarget final : public X86GenSubtargetInfo { unsigned RequiredVectorWidth; /// True if compiling for 64-bit, false for 16-bit or 32-bit. - bool In64BitMode; + bool In64BitMode = false; /// True if compiling for 32-bit, false for 16-bit or 64-bit. - bool In32BitMode; + bool In32BitMode = false; /// True if compiling for 16-bit, false for 32-bit or 64-bit. - bool In16BitMode; - - /// Contains the Overhead of gather\scatter instructions - int GatherOverhead = 1024; - int ScatterOverhead = 1024; + bool In16BitMode = false; X86SelectionDAGInfo TSInfo; // Ordering here is important. 
X86InstrInfo initializes X86RegisterInfo which @@ -678,8 +671,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isPMADDWDSlow() const { return IsPMADDWDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } - int getGatherOverhead() const { return GatherOverhead; } - int getScatterOverhead() const { return ScatterOverhead; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } bool useLeaForSP() const { return UseLeaForSP; } @@ -729,7 +720,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasWAITPKG() const { return HasWAITPKG; } bool hasPCONFIG() const { return HasPCONFIG; } bool hasSGX() const { return HasSGX; } - bool threewayBranchProfitable() const { return ThreewayBranchProfitable; } bool hasINVPCID() const { return HasINVPCID; } bool hasENQCMD() const { return HasENQCMD; } bool hasSERIALIZE() const { return HasSERIALIZE; } @@ -787,8 +777,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isXRaySupported() const override { return is64Bit(); } - X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; } - /// TODO: to be removed later and replaced with suitable properties bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a9ea245f7027..685a8e8fa8774 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -56,11 +56,6 @@ static cl::opt EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); -static cl::opt EnableCondBrFoldingPass("x86-condbr-folding", - cl::desc("Enable the conditional branch " - "folding pass"), - cl::init(false), cl::Hidden); - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine X(getTheX86_32Target()); @@ -84,7 +79,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); - initializeX86CondBrFoldingPassPass(PR); initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); @@ -317,6 +311,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + assert(SrcAS != DestAS && "Expected different address spaces!"); + if (getPointerSize(SrcAS) != getPointerSize(DestAS)) + return false; + return SrcAS < 256 && DestAS < 256; +} + //===----------------------------------------------------------------------===// // X86 TTI query. 
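The new X86TargetMachine::isNoopAddrSpaceCast above encodes the x86 convention that address spaces 256, 257 and 258 are the segment-relative gs/fs/ss spaces, while everything below 256 aliases the flat address space. A hedged sketch of the predicate, with pointer widths passed in explicitly instead of queried from the target:

// Sketch: casts between sub-256 address spaces are no-ops on x86 as long as
// the pointer width does not change; 256/257/258 (gs/fs/ss) are real casts.
#include <cassert>

static bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS,
                                unsigned SrcPtrBits, unsigned DestPtrBits) {
  if (SrcPtrBits != DestPtrBits)
    return false;
  return SrcAS < 256 && DestAS < 256;
}

int main() {
  assert(isNoopAddrSpaceCast(0, 1, 64, 64));    // flat <-> flat: no-op
  assert(!isNoopAddrSpaceCast(0, 256, 64, 64)); // gs-relative: real cast
  assert(!isNoopAddrSpaceCast(0, 1, 64, 32));   // width change: real cast
  return 0;
}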
//===----------------------------------------------------------------------===// @@ -456,8 +458,6 @@ bool X86PassConfig::addGlobalInstructionSelect() { } bool X86PassConfig::addILPOpts() { - if (EnableCondBrFoldingPass) - addPass(createX86CondBrFolding()); addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 8d98474a39c06..69d7e48b89778 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -54,6 +54,8 @@ class X86TargetMachine final : public LLVMTargetMachine { } bool isJIT() const { return IsJIT; } + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index cc18e55656ef5..c9179742bcb9c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1367,6 +1367,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, } int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -1988,7 +1989,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind)); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind)); MVT SimpleSrcTy = SrcTy.getSimpleVT(); MVT SimpleDstTy = DstTy.getSimpleVT(); @@ -2049,7 +2050,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return AdjustCost(Entry->Cost); } - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, @@ -2270,6 +2272,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTLZ, MVT::v16i8, 4 }, }; static const CostTblEntry AVX512BWCostTbl[] = { + { ISD::ABS, MVT::v32i16, 1 }, + { ISD::ABS, MVT::v64i8, 1 }, { ISD::BITREVERSE, MVT::v8i64, 5 }, { ISD::BITREVERSE, MVT::v16i32, 5 }, { ISD::BITREVERSE, MVT::v32i16, 5 }, @@ -2288,14 +2292,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v64i8, 9 }, { ISD::SADDSAT, MVT::v32i16, 1 }, { ISD::SADDSAT, MVT::v64i8, 1 }, + { ISD::SMAX, MVT::v32i16, 1 }, + { ISD::SMAX, MVT::v64i8, 1 }, + { ISD::SMIN, MVT::v32i16, 1 }, + { ISD::SMIN, MVT::v64i8, 1 }, { ISD::SSUBSAT, MVT::v32i16, 1 }, { ISD::SSUBSAT, MVT::v64i8, 1 }, { ISD::UADDSAT, MVT::v32i16, 1 }, { ISD::UADDSAT, MVT::v64i8, 1 }, + { ISD::UMAX, MVT::v32i16, 1 }, + { ISD::UMAX, MVT::v64i8, 1 }, + { ISD::UMIN, MVT::v32i16, 1 }, + { ISD::UMIN, MVT::v64i8, 1 }, { ISD::USUBSAT, MVT::v32i16, 1 }, { ISD::USUBSAT, MVT::v64i8, 1 }, }; static const CostTblEntry AVX512CostTbl[] = { + { ISD::ABS, MVT::v8i64, 1 }, + { ISD::ABS, MVT::v16i32, 1 }, + { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split + { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split + { ISD::ABS, MVT::v4i64, 1 }, + { ISD::ABS, MVT::v2i64, 1 }, { ISD::BITREVERSE, MVT::v8i64, 36 }, { ISD::BITREVERSE, MVT::v16i32, 24 }, { ISD::BITREVERSE, MVT::v32i16, 10 }, @@ -2312,6 +2330,30 @@ int 
X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i32, 28 }, { ISD::CTTZ, MVT::v32i16, 24 }, { ISD::CTTZ, MVT::v64i8, 18 }, + { ISD::SMAX, MVT::v8i64, 1 }, + { ISD::SMAX, MVT::v16i32, 1 }, + { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SMAX, MVT::v4i64, 1 }, + { ISD::SMAX, MVT::v2i64, 1 }, + { ISD::SMIN, MVT::v8i64, 1 }, + { ISD::SMIN, MVT::v16i32, 1 }, + { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SMIN, MVT::v4i64, 1 }, + { ISD::SMIN, MVT::v2i64, 1 }, + { ISD::UMAX, MVT::v8i64, 1 }, + { ISD::UMAX, MVT::v16i32, 1 }, + { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UMAX, MVT::v4i64, 1 }, + { ISD::UMAX, MVT::v2i64, 1 }, + { ISD::UMIN, MVT::v8i64, 1 }, + { ISD::UMIN, MVT::v16i32, 1 }, + { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UMIN, MVT::v4i64, 1 }, + { ISD::UMIN, MVT::v2i64, 1 }, { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq @@ -2352,6 +2394,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::BITREVERSE, MVT::i8, 3 } }; static const CostTblEntry AVX2CostTbl[] = { + { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) + { ISD::ABS, MVT::v8i32, 1 }, + { ISD::ABS, MVT::v16i16, 1 }, + { ISD::ABS, MVT::v32i8, 1 }, { ISD::BITREVERSE, MVT::v4i64, 5 }, { ISD::BITREVERSE, MVT::v8i32, 5 }, { ISD::BITREVERSE, MVT::v16i16, 5 }, @@ -2373,11 +2419,23 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v32i8, 9 }, { ISD::SADDSAT, MVT::v16i16, 1 }, { ISD::SADDSAT, MVT::v32i8, 1 }, + { ISD::SMAX, MVT::v8i32, 1 }, + { ISD::SMAX, MVT::v16i16, 1 }, + { ISD::SMAX, MVT::v32i8, 1 }, + { ISD::SMIN, MVT::v8i32, 1 }, + { ISD::SMIN, MVT::v16i16, 1 }, + { ISD::SMIN, MVT::v32i8, 1 }, { ISD::SSUBSAT, MVT::v16i16, 1 }, { ISD::SSUBSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v16i16, 1 }, { ISD::UADDSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd + { ISD::UMAX, MVT::v8i32, 1 }, + { ISD::UMAX, MVT::v16i16, 1 }, + { ISD::UMAX, MVT::v32i8, 1 }, + { ISD::UMIN, MVT::v8i32, 1 }, + { ISD::UMIN, MVT::v16i16, 1 }, + { ISD::UMIN, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v16i16, 1 }, { ISD::USUBSAT, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd @@ -2389,6 +2447,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ }; static const CostTblEntry AVX1CostTbl[] = { + { ISD::ABS, MVT::v4i64, 6 }, // VBLENDVPD(X,VPSUBQ(0,X),X) + { ISD::ABS, MVT::v8i32, 3 }, + { ISD::ABS, MVT::v16i16, 3 }, + { ISD::ABS, MVT::v32i8, 3 }, { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert @@ -2410,11 +2472,23 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 
128-bit Op + extract/insert + { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert @@ -2444,12 +2518,26 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd }; static const CostTblEntry SSE42CostTbl[] = { + { ISD::ABS, MVT::v2i64, 3 }, // BLENDVPD(X,PSUBQ(0,X),X) { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; + static const CostTblEntry SSE41CostTbl[] = { + { ISD::SMAX, MVT::v4i32, 1 }, + { ISD::SMAX, MVT::v16i8, 1 }, + { ISD::SMIN, MVT::v4i32, 1 }, + { ISD::SMIN, MVT::v16i8, 1 }, + { ISD::UMAX, MVT::v4i32, 1 }, + { ISD::UMAX, MVT::v8i16, 1 }, + { ISD::UMIN, MVT::v4i32, 1 }, + { ISD::UMIN, MVT::v8i16, 1 }, + }; static const CostTblEntry SSSE3CostTbl[] = { + { ISD::ABS, MVT::v4i32, 1 }, + { ISD::ABS, MVT::v8i16, 1 }, + { ISD::ABS, MVT::v16i8, 1 }, { ISD::BITREVERSE, MVT::v2i64, 5 }, { ISD::BITREVERSE, MVT::v4i32, 5 }, { ISD::BITREVERSE, MVT::v8i16, 5 }, @@ -2471,6 +2559,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i8, 9 } }; static const CostTblEntry SSE2CostTbl[] = { + { ISD::ABS, MVT::v2i64, 4 }, + { ISD::ABS, MVT::v4i32, 3 }, + { ISD::ABS, MVT::v8i16, 3 }, + { ISD::ABS, MVT::v16i8, 3 }, { ISD::BITREVERSE, MVT::v2i64, 29 }, { ISD::BITREVERSE, MVT::v4i32, 27 }, { ISD::BITREVERSE, MVT::v8i16, 27 }, @@ -2492,10 +2584,14 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i8, 13 }, { ISD::SADDSAT, MVT::v8i16, 1 }, { ISD::SADDSAT, MVT::v16i8, 1 }, + { ISD::SMAX, MVT::v8i16, 1 }, + { ISD::SMIN, MVT::v8i16, 1 }, { ISD::SSUBSAT, MVT::v8i16, 1 }, { ISD::SSUBSAT, MVT::v16i8, 1 }, { ISD::UADDSAT, MVT::v8i16, 1 }, { ISD::UADDSAT, MVT::v16i8, 1 }, + { ISD::UMAX, MVT::v16i8, 1 }, + { ISD::UMIN, MVT::v16i8, 1 }, { ISD::USUBSAT, MVT::v8i16, 1 }, { ISD::USUBSAT, MVT::v16i8, 1 }, { ISD::FMAXNUM, MVT::f64, 4 }, @@ -2569,6 +2665,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( switch (IID) { default: break; + case Intrinsic::abs: + ISD = ISD::ABS; + break; case Intrinsic::bitreverse: ISD = ISD::BITREVERSE; break; @@ -2592,12 +2691,24 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( case Intrinsic::sadd_sat: ISD = ISD::SADDSAT; break; + case Intrinsic::smax: + ISD = ISD::SMAX; + break; + case Intrinsic::smin: + ISD = ISD::SMIN; + break; 
case Intrinsic::ssub_sat: ISD = ISD::SSUBSAT; break; case Intrinsic::uadd_sat: ISD = ISD::UADDSAT; break; + case Intrinsic::umax: + ISD = ISD::UMAX; + break; + case Intrinsic::umin: + ISD = ISD::UMIN; + break; case Intrinsic::usub_sat: ISD = ISD::USUBSAT; break; @@ -2660,6 +2771,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) return LT.first * Entry->Cost; @@ -3848,6 +3963,26 @@ X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { return CostKind == TTI::TCK_RecipThroughput ? 0 : 1; } +int X86TTIImpl::getGatherOverhead() const { + // Some CPUs have more overhead for gather. The specified overhead is relative + // to the Load operation. "2" is the number provided by Intel architects. This + // parameter is used for cost estimation of Gather Op and comparison with + // other alternatives. + // TODO: Remove the explicit hasAVX512()?, That would mean we would only + // enable gather with a -march. + if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) + return 2; + + return 1024; +} + +int X86TTIImpl::getScatterOverhead() const { + if (ST->hasAVX512()) + return 2; + + return 1024; +} + // Return an average cost of Gather / Scatter instruction, maybe improved later int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, Align Alignment, unsigned AddressSpace) { @@ -3906,8 +4041,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. const int GSOverhead = (Opcode == Instruction::Load) - ? ST->getGatherOverhead() - : ST->getScatterOverhead(); + ? 
getGatherOverhead() + : getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), MaybeAlign(Alignment), AddressSpace, TTI::TCK_RecipThroughput); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index ca875fa68523c..8d2fa27ee7b00 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -62,7 +62,6 @@ class X86TTIImpl : public BasicTTIImplBase { X86::FeatureLZCNTFalseDeps, X86::FeatureBranchFusion, X86::FeatureMacroFusion, - X86::FeatureMergeToThreeWayBranch, X86::FeaturePadShortFunctions, X86::FeaturePOPCNTFalseDeps, X86::FeatureSSEUnalignedMem, @@ -131,7 +130,7 @@ class X86TTIImpl : public BasicTTIImplBase { int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, TTI::TargetCostKind CostKind, @@ -244,6 +243,9 @@ class X86TTIImpl : public BasicTTIImplBase { int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr, Align Alignment, unsigned AddressSpace); + int getGatherOverhead() const; + int getScatterOverhead() const; + /// @} }; diff --git a/llvm/lib/Testing/Support/CMakeLists.txt b/llvm/lib/Testing/Support/CMakeLists.txt index 595221a105cd3..fe460aeefc91f 100644 --- a/llvm/lib/Testing/Support/CMakeLists.txt +++ b/llvm/lib/Testing/Support/CMakeLists.txt @@ -1,10 +1,6 @@ add_definitions(-DGTEST_LANG_CXX11=1) add_definitions(-DGTEST_HAS_TR1_TUPLE=0) -if (CXX_SUPPORTS_SUGGEST_OVERRIDE_FLAG) - add_compile_options("-Wno-suggest-override") -endif() - add_llvm_library(LLVMTestingSupport Annotations.cpp Error.cpp diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index c409012554245..cd39428b9c38e 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -191,9 +191,11 @@ static void appendFile(std::vector &Members, file_magic Magic = identify_magic(MB.getBuffer()); if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::archive && Magic != file_magic::windows_resource) { + Magic != file_magic::archive && Magic != file_magic::windows_resource && + Magic != file_magic::coff_import_library) { llvm::errs() << MB.getBufferIdentifier() - << ": not a COFF object, bitcode, archive or resource file\n"; + << ": not a COFF object, bitcode, archive, import library or " + "resource file\n"; exit(1); } diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index ad0d7eb51507a..d511ad2729abc 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -153,10 +154,6 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) ++NumArgumentsDead; - - // There may be remaining metadata uses of the argument for things like - // llvm.dbg.value. Replace them with undef. 
- I->replaceAllUsesWith(UndefValue::get(I->getType())); } else { // Okay, this is being promoted. This means that the only uses are loads // or GEPs which are only used by loads @@ -414,6 +411,11 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, continue; } + // There potentially are metadata uses for things like llvm.dbg.value. + // Replace them with undef, after handling the other regular uses. + auto RauwUndefMetadata = make_scope_exit( + [&]() { I->replaceAllUsesWith(UndefValue::get(I->getType())); }); + if (I->use_empty()) continue; @@ -465,7 +467,6 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, GEP->eraseFromParent(); } } - // Increment I2 past all of the arguments added for this promoted pointer. std::advance(I2, ArgIndices.size()); } diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index f96dac5f3515c..89971cc9c2946 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -15,7 +15,11 @@ #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ValueTracking.h" @@ -25,10 +29,15 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include +#include using namespace llvm; @@ -85,6 +94,23 @@ static cl::list "allowed to be seeded."), cl::ZeroOrMore, cl::CommaSeparated); +static cl::opt + DumpDepGraph("attributor-dump-dep-graph", cl::Hidden, + cl::desc("Dump the dependency graph to dot files."), + cl::init(false)); + +static cl::opt DepGraphDotFileNamePrefix( + "attributor-depgraph-dot-filename-prefix", cl::Hidden, + cl::desc("The prefix used for the CallGraph dot file names.")); + +static cl::opt ViewDepGraph("attributor-view-dep-graph", cl::Hidden, + cl::desc("View the dependency graph."), + cl::init(false)); + +static cl::opt PrintDependencies("attributor-print-dep", cl::Hidden, + cl::desc("Print attribute dependencies"), + cl::init(false)); + /// Logic operators for the change status enum class. /// ///{ @@ -498,8 +524,10 @@ Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA, Attributor::~Attributor() { // The abstract attributes are allocated via the BumpPtrAllocator Allocator, // thus we cannot delete them. We can, and want to, destruct them though. 
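The comment above explains why Attributor's destructor (continued just below) must destruct the abstract attributes by hand: they are placement-constructed in a BumpPtrAllocator arena, whose memory is reclaimed in bulk, so delete is never called on them. A minimal sketch of that idiom, with a hypothetical MyAA type standing in for the attribute classes:

#include "llvm/Support/Allocator.h"
#include <new>

struct MyAA {
  ~MyAA() { /* release any non-arena resources */ }
};

void demo() {
  llvm::BumpPtrAllocator Alloc;
  // Placement-new into the arena; there is no matching delete.
  MyAA *AA = new (Alloc.Allocate<MyAA>()) MyAA();
  // ... use AA ...
  AA->~MyAA(); // run the destructor explicitly; Alloc frees the memory in bulk
}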
- for (AbstractAttribute *AA : AllAbstractAttributes) + for (auto &DepAA : DG.SyntheticRoot.Deps) { + AbstractAttribute *AA = cast(DepAA.getPointer()); AA->~AbstractAttribute(); + } } bool Attributor::isAssumedDead(const AbstractAttribute &AA, @@ -903,8 +931,9 @@ bool Attributor::checkForAllReadWriteInstructions( } void Attributor::runTillFixpoint() { + TimeTraceScope TimeScope("Attributor::runTillFixpoint"); LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized " - << AllAbstractAttributes.size() + << DG.SyntheticRoot.Deps.size() << " abstract attributes.\n"); // Now that all abstract attributes are collected and initialized we start @@ -914,11 +943,11 @@ void Attributor::runTillFixpoint() { SmallVector ChangedAAs; SetVector Worklist, InvalidAAs; - Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); + Worklist.insert(DG.SyntheticRoot.begin(), DG.SyntheticRoot.end()); do { // Remember the size to determine new attributes. - size_t NumAAs = AllAbstractAttributes.size(); + size_t NumAAs = DG.SyntheticRoot.Deps.size(); LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); @@ -935,7 +964,7 @@ void Attributor::runTillFixpoint() { while (!InvalidAA->Deps.empty()) { const auto &Dep = InvalidAA->Deps.back(); InvalidAA->Deps.pop_back(); - AbstractAttribute *DepAA = Dep.getPointer(); + AbstractAttribute *DepAA = cast(Dep.getPointer()); if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) { Worklist.insert(DepAA); continue; @@ -953,7 +982,8 @@ void Attributor::runTillFixpoint() { // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) while (!ChangedAA->Deps.empty()) { - Worklist.insert(ChangedAA->Deps.back().getPointer()); + Worklist.insert( + cast(ChangedAA->Deps.back().getPointer())); ChangedAA->Deps.pop_back(); } @@ -981,8 +1011,8 @@ void Attributor::runTillFixpoint() { // Add attributes to the changed set if they have been created in the last // iteration. - ChangedAAs.append(AllAbstractAttributes.begin() + NumAAs, - AllAbstractAttributes.end()); + ChangedAAs.append(DG.SyntheticRoot.begin() + NumAAs, + DG.SyntheticRoot.end()); // Reset the work list and repopulate with the changed abstract attributes. // Note that dependent ones are added above. 
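The loop being reworked in these hunks is a classic chaotic-iteration fixpoint: update one attribute, and if its state changed, re-queue everything that depends on it, stopping when the worklist drains or the iteration budget runs out. Stripped of the Attributor specifics, the shape is roughly the following (hypothetical Node type and transfer function; a sketch only, not the real data structures):

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Dependents;
  bool update() { return false; } // hypothetical transfer function
};

void runTillFixpoint(const std::vector<Node *> &All) {
  std::set<Node *> Worklist(All.begin(), All.end());
  while (!Worklist.empty()) {
    Node *N = *Worklist.begin();
    Worklist.erase(Worklist.begin());
    if (N->update())                  // state changed,
      for (Node *Dep : N->Dependents) // so dependents must be revisited
        Worklist.insert(Dep);
  }
}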
@@ -1015,7 +1045,8 @@ void Attributor::runTillFixpoint() { } while (!ChangedAA->Deps.empty()) { - ChangedAAs.push_back(ChangedAA->Deps.back().getPointer()); + ChangedAAs.push_back( + cast(ChangedAA->Deps.back().getPointer())); ChangedAA->Deps.pop_back(); } } @@ -1037,12 +1068,14 @@ void Attributor::runTillFixpoint() { } ChangeStatus Attributor::manifestAttributes() { - size_t NumFinalAAs = AllAbstractAttributes.size(); + TimeTraceScope TimeScope("Attributor::manifestAttributes"); + size_t NumFinalAAs = DG.SyntheticRoot.Deps.size(); unsigned NumManifested = 0; unsigned NumAtFixpoint = 0; ChangeStatus ManifestChange = ChangeStatus::UNCHANGED; - for (AbstractAttribute *AA : AllAbstractAttributes) { + for (auto &DepAA : DG.SyntheticRoot.Deps) { + AbstractAttribute *AA = cast(DepAA.getPointer()); AbstractState &State = AA->getState(); // If there is not already a fixpoint reached, we can now take the @@ -1082,11 +1115,14 @@ ChangeStatus Attributor::manifestAttributes() { NumAttributesValidFixpoint += NumAtFixpoint; (void)NumFinalAAs; - if (NumFinalAAs != AllAbstractAttributes.size()) { - for (unsigned u = NumFinalAAs; u < AllAbstractAttributes.size(); ++u) - errs() << "Unexpected abstract attribute: " << *AllAbstractAttributes[u] + if (NumFinalAAs != DG.SyntheticRoot.Deps.size()) { + for (unsigned u = NumFinalAAs; u < DG.SyntheticRoot.Deps.size(); ++u) + errs() << "Unexpected abstract attribute: " + << cast(DG.SyntheticRoot.Deps[u].getPointer()) << " :: " - << AllAbstractAttributes[u]->getIRPosition().getAssociatedValue() + << cast(DG.SyntheticRoot.Deps[u].getPointer()) + ->getIRPosition() + .getAssociatedValue() << "\n"; llvm_unreachable("Expected the final number of abstract attributes to " "remain unchanged!"); @@ -1095,6 +1131,7 @@ ChangeStatus Attributor::manifestAttributes() { } ChangeStatus Attributor::cleanupIR() { + TimeTraceScope TimeScope("Attributor::cleanupIR"); // Delete stuff at the end to avoid invalid references and a nice order. LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least " << ToBeDeletedFunctions.size() << " functions and " @@ -1263,14 +1300,29 @@ ChangeStatus Attributor::cleanupIR() { } ChangeStatus Attributor::run() { + TimeTraceScope TimeScope("Attributor::run"); + SeedingPeriod = false; runTillFixpoint(); + + // dump graphs on demand + if (DumpDepGraph) + DG.dumpGraph(); + + if (ViewDepGraph) + DG.viewGraph(); + + if (PrintDependencies) + DG.print(); + ChangeStatus ManifestChange = manifestAttributes(); ChangeStatus CleanupChange = cleanupIR(); return ManifestChange | CleanupChange; } ChangeStatus Attributor::updateAA(AbstractAttribute &AA) { + TimeTraceScope TimeScope(AA.getName() + "::updateAA"); + // Use a new dependence vector for this update. 
DependenceVector DV; DependenceStack.push_back(&DV); @@ -2028,8 +2080,31 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) { } void AbstractAttribute::print(raw_ostream &OS) const { - OS << "[P: " << getIRPosition() << "][" << getAsStr() << "][S: " << getState() - << "]"; + OS << "["; + OS << getName(); + OS << "] for CtxI "; + + if (auto *I = getCtxI()) { + OS << "'"; + I->print(OS); + OS << "'"; + } else + OS << "<>"; + + OS << " at position " << getIRPosition() << " with state " << getAsStr() + << '\n'; +} + +void AbstractAttribute::printWithDeps(raw_ostream &OS) const { + print(OS); + + for (const auto &DepAA : Deps) { + auto *AA = DepAA.getPointer(); + OS << " updates "; + AA->print(OS); + } + + OS << '\n'; } ///} @@ -2064,8 +2139,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, NumFnWithoutExactDefinition++; // We look at internal functions only on-demand but if any use is not a - // direct call or outside the current set of analyzed functions, we have to - // do it eagerly. + // direct call or outside the current set of analyzed functions, we have + // to do it eagerly. if (F->hasLocalLinkage()) { if (llvm::all_of(F->uses(), [&Functions](const Use &U) { const auto *CB = dyn_cast(U.getUser()); @@ -2081,11 +2156,41 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, } ChangeStatus Changed = A.run(); + LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size() << " functions, result: " << Changed << ".\n"); return Changed == ChangeStatus::CHANGED; } +void AADepGraph::viewGraph() { llvm::ViewGraph(this, "Dependency Graph"); } + +void AADepGraph::dumpGraph() { + static std::atomic CallTimes; + std::string Prefix; + + if (!DepGraphDotFileNamePrefix.empty()) + Prefix = DepGraphDotFileNamePrefix; + else + Prefix = "dep_graph"; + std::string Filename = + Prefix + "_" + std::to_string(CallTimes.load()) + ".dot"; + + outs() << "Dependency graph dump to " << Filename << ".\n"; + + std::error_code EC; + + raw_fd_ostream File(Filename, EC, sys::fs::OF_Text); + if (!EC) + llvm::WriteGraph(File, this); + + CallTimes++; +} + +void AADepGraph::print() { + for (auto DepAA : SyntheticRoot.Deps) + cast(DepAA.getPointer())->printWithDeps(outs()); +} + PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); @@ -2132,6 +2237,51 @@ PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); } +namespace llvm { + +template <> struct GraphTraits { + using NodeRef = AADepGraphNode *; + using DepTy = PointerIntPair; + using EdgeRef = PointerIntPair; + + static NodeRef getEntryNode(AADepGraphNode *DGN) { return DGN; } + static NodeRef DepGetVal(DepTy &DT) { return DT.getPointer(); } + + using ChildIteratorType = + mapped_iterator::iterator, decltype(&DepGetVal)>; + using ChildEdgeIteratorType = TinyPtrVector::iterator; + + static ChildIteratorType child_begin(NodeRef N) { return N->child_begin(); } + + static ChildIteratorType child_end(NodeRef N) { return N->child_end(); } +}; + +template <> +struct GraphTraits : public GraphTraits { + static NodeRef getEntryNode(AADepGraph *DG) { return DG->GetEntryNode(); } + + using nodes_iterator = + mapped_iterator::iterator, decltype(&DepGetVal)>; + + static nodes_iterator nodes_begin(AADepGraph *DG) { return DG->begin(); } + + static nodes_iterator nodes_end(AADepGraph *DG) { return DG->end(); } +}; + +template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { + 
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getNodeLabel(const AADepGraphNode *Node, + const AADepGraph *DG) { + std::string AAString = ""; + raw_string_ostream O(AAString); + Node->print(O); + return AAString; + } +}; + +} // end namespace llvm + namespace { struct AttributorLegacyPass : public ModulePass { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index dc916089d394e..89d359bcbe034 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -13,13 +13,16 @@ #include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumeBundleQueries.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -1052,9 +1055,10 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { // map, NewRVsMap. decltype(ReturnedValues) NewRVsMap; - auto HandleReturnValue = [&](Value *RV, SmallSetVector<ReturnInst *, 4> &RIs) { - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV - << " by #" << RIs.size() << " RIs\n"); + auto HandleReturnValue = [&](Value *RV, + SmallSetVector<ReturnInst *, 4> &RIs) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV << " by #" + << RIs.size() << " RIs\n"); CallBase *CB = dyn_cast<CallBase>(RV); if (!CB || UnresolvedCalls.count(CB)) return; @@ -1979,6 +1983,61 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { return true; }; + auto InspectCallSiteForUB = [&](Instruction &I) { + // Check whether a callsite always causes UB or not + + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // Check nonnull and noundef argument attribute violation for each + // callsite. + CallBase &CB = cast<CallBase>(I); + Function *Callee = CB.getCalledFunction(); + if (!Callee) + return true; + for (unsigned idx = 0; idx < CB.getNumArgOperands(); idx++) { + // If the current argument is known to be simplified to a null pointer and + // the corresponding argument position is known to have the nonnull + // attribute, the argument is poison. Furthermore, if the argument is + // poison and the position is known to have the noundef attribute, this + // callsite is considered UB. + // TODO: Also check the nopoison attribute if it is introduced. + if (idx >= Callee->arg_size()) + break; + Value *ArgVal = CB.getArgOperand(idx); + if (!ArgVal) + continue; + IRPosition CalleeArgumentIRP = + IRPosition::argument(*Callee->getArg(idx)); + if (!CalleeArgumentIRP.hasAttr({Attribute::NoUndef})) + continue; + auto &NonNullAA = A.getAAFor<AANonNull>(*this, CalleeArgumentIRP); + if (!NonNullAA.isKnownNonNull()) + continue; + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*ArgVal)); + Optional<Value *> SimplifiedVal = + ValueSimplifyAA.getAssumedSimplifiedValue(A); + + if (!ValueSimplifyAA.isKnown()) + continue; + // Here, we handle three cases. + // (1) Not having a value means it is dead. (we can replace the value + // with undef) + // (2) Simplified to a null pointer. The argument is a poison value and + // violates the noundef attribute. + // (3) Simplified to undef.
The argument violates the noundef attribute. + if (!SimplifiedVal.hasValue() || + isa<ConstantPointerNull>(*SimplifiedVal.getValue()) || + isa<UndefValue>(*SimplifiedVal.getValue())) { + KnownUBInsts.insert(&I); + return true; + } + } + return true; + }; + A.checkForAllInstructions(InspectMemAccessInstForUB, *this, {Instruction::Load, Instruction::Store, Instruction::AtomicCmpXchg, @@ -1986,6 +2045,7 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { /* CheckBBLivenessOnly */ true); A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br}, /* CheckBBLivenessOnly */ true); + A.checkForAllCallLikeInstructions(InspectCallSiteForUB, *this); if (NoUBPrevSize != AssumedNoUBInsts.size() || UBPrevSize != KnownUBInsts.size()) return ChangeStatus::CHANGED; @@ -3425,7 +3485,6 @@ struct AADereferenceableFloating : AADereferenceableImpl { T.GlobalState &= DS.GlobalState; } - // For now we do not try to "increase" dereferenceability due to negative // indices as we first have to come up with code to deal with loops and // for overflows of the dereferenceable bytes. @@ -5418,8 +5477,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl { /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...) Optional<Type *> identifyPrivatizableType(Attributor &A) override { - Value *Obj = - GetUnderlyingObject(&getAssociatedValue(), A.getInfoCache().getDL()); + Value *Obj = getUnderlyingObject(&getAssociatedValue()); if (!Obj) { LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n"); return nullptr; diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 3f30c0289e8c3..a17ef63371063 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -18,7 +18,6 @@ add_llvm_component_library(LLVMipo GlobalOpt.cpp GlobalSplit.cpp HotColdSplitting.cpp - IPConstantPropagation.cpp IPO.cpp InferFunctionAttrs.cpp InlineSimple.cpp diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 4baeaa6e16304..5f70f8eaebb65 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -63,7 +63,7 @@ using namespace llvm; -#define DEBUG_TYPE "functionattrs" +#define DEBUG_TYPE "function-attrs" STATISTIC(NumReadNone, "Number of functions marked readnone"); STATISTIC(NumReadOnly, "Number of functions marked readonly"); @@ -1477,11 +1477,11 @@ struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass { } // end anonymous namespace char PostOrderFunctionAttrsLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "functionattrs", +INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "function-attrs", "Deduce function attributes", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "functionattrs", +INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "function-attrs", "Deduce function attributes", false, false) Pass *llvm::createPostOrderFunctionAttrsLegacyPass() { @@ -1542,11 +1542,13 @@ struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass { char ReversePostOrderFunctionAttrsLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs", - "Deduce function attributes in RPO", false, false) +INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass, + "rpo-function-attrs", "Deduce function attributes in RPO", + false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, "rpo-functionattrs", - "Deduce function attributes in RPO", false, false) +INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass, + "rpo-function-attrs", "Deduce function attributes in RPO", + false, false) Pass *llvm::createReversePostOrderFunctionAttrsPass() { return new ReversePostOrderFunctionAttrsLegacyPass(); diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 468bf19f2e48a..e02f8d62da7a0 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -884,6 +884,7 @@ void llvm::computeDeadSymbols( while (!Worklist.empty()) { auto VI = Worklist.pop_back_val(); for (auto &Summary : VI.getSummaryList()) { + Summary->setLive(true); if (auto *AS = dyn_cast(Summary.get())) { // If this is an alias, visit the aliasee VI to ensure that all copies // are marked live and it is added to the worklist for further @@ -891,8 +892,6 @@ void llvm::computeDeadSymbols( visit(AS->getAliaseeVI(), true); continue; } - - Summary->setLive(true); for (auto Ref : Summary->refs()) visit(Ref, false); if (auto *FS = dyn_cast(Summary.get())) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d9fb820f7cb53..0b650b3b58f43 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2221,8 +2221,7 @@ isValidCandidateForColdCC(Function &F, BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc); if (!isColdCallSite(CB, CallerBFI)) return false; - auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc); - if (It == AllCallsCold.end()) + if (!llvm::is_contained(AllCallsCold, CallerFunc)) return false; } return true; diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index d0bd0166534a7..cdfd9879c030a 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -101,7 +101,8 @@ bool blockEndsInUnreachable(const BasicBlock &BB) { return !(isa(I) || isa(I)); } -bool unlikelyExecuted(BasicBlock &BB) { +bool unlikelyExecuted(BasicBlock &BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { // Exception handling blocks are unlikely executed. if (BB.isEHPad() || isa(BB.getTerminator())) return true; @@ -114,12 +115,19 @@ bool unlikelyExecuted(BasicBlock &BB) { return true; // The block is cold if it has an unreachable terminator, unless it's - // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp). + // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp); + // in the case of a longjmp, if the block is cold according to + // profile information, we mark it as unlikely to be executed as well. 
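The comment above is implemented by the hunk that follows: a noreturn call before an unreachable terminator no longer unconditionally blocks the cold classification; for longjmp-like calls the block can still be deemed cold when the profile says so. The profile gate itself reduces to a check like this (a sketch; it assumes, as the surrounding code does, that PSI is non-null whenever BFI is available):

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"

static bool coldByProfile(const llvm::BasicBlock *BB,
                          llvm::ProfileSummaryInfo *PSI,
                          llvm::BlockFrequencyInfo *BFI) {
  // Without block frequencies we cannot tell, so stay conservative.
  return BFI && PSI->isColdBlock(BB, BFI);
}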
if (blockEndsInUnreachable(BB)) { if (auto *CI = dyn_cast_or_null(BB.getTerminator()->getPrevNode())) - if (CI->hasFnAttr(Attribute::NoReturn)) - return false; + if (CI->hasFnAttr(Attribute::NoReturn)) { + if (IntrinsicInst *II = dyn_cast(CI)) + return (II->getIntrinsicID() != Intrinsic::eh_sjlj_longjmp) || + (BFI && PSI->isColdBlock(&BB, BFI)); + return !CI->getCalledFunction()->getName().contains("longjmp") || + (BFI && PSI->isColdBlock(&BB, BFI)); + } return true; } @@ -575,7 +583,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { continue; bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) || - (EnableStaticAnalyis && unlikelyExecuted(*BB)); + (EnableStaticAnalyis && unlikelyExecuted(*BB, PSI, BFI)); if (!Cold) continue; diff --git a/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp deleted file mode 100644 index 8d05a72d68dac..0000000000000 --- a/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ /dev/null @@ -1,308 +0,0 @@ -//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass implements an _extremely_ simple interprocedural constant -// propagation pass. It could certainly be improved in many different ways, -// like using a worklist. This pass makes arguments dead, but does not remove -// them. The existing dead argument elimination pass should be run after this -// to clean up the mess. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/AbstractCallSite.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/IPO.h" -using namespace llvm; - -#define DEBUG_TYPE "ipconstprop" - -STATISTIC(NumArgumentsProped, "Number of args turned into constants"); -STATISTIC(NumReturnValProped, "Number of return values turned into constants"); - -namespace { - /// IPCP - The interprocedural constant propagation pass - /// - struct IPCP : public ModulePass { - static char ID; // Pass identification, replacement for typeid - IPCP() : ModulePass(ID) { - initializeIPCPPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override; - }; -} - -/// PropagateConstantsIntoArguments - Look at all uses of the specified -/// function. If all uses are direct call sites, and all pass a particular -/// constant in for an argument, propagate that constant in as the argument. -/// -static bool PropagateConstantsIntoArguments(Function &F) { - if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit. - - // For each argument, keep track of its constant value and whether it is a - // constant or not. The bool is driven to true when found to be non-constant. - SmallVector, 16> ArgumentConstants; - ArgumentConstants.resize(F.arg_size()); - - unsigned NumNonconstant = 0; - for (Use &U : F.uses()) { - User *UR = U.getUser(); - // Ignore blockaddress uses. - if (isa(UR)) continue; - - // If no abstract call site was created we did not understand the use, bail. 
- AbstractCallSite ACS(&U); - if (!ACS) - return false; - - // Mismatched argument count is undefined behavior. Simply bail out to avoid - // handling of such situations below (avoiding asserts/crashes). - unsigned NumActualArgs = ACS.getNumArgOperands(); - if (F.isVarArg() ? ArgumentConstants.size() > NumActualArgs - : ArgumentConstants.size() != NumActualArgs) - return false; - - // Check out all of the potentially constant arguments. Note that we don't - // inspect varargs here. - Function::arg_iterator Arg = F.arg_begin(); - for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++Arg) { - - // If this argument is known non-constant, ignore it. - if (ArgumentConstants[i].getInt()) - continue; - - Value *V = ACS.getCallArgOperand(i); - Constant *C = dyn_cast_or_null(V); - - // Mismatched argument type is undefined behavior. Simply bail out to avoid - // handling of such situations below (avoiding asserts/crashes). - if (C && Arg->getType() != C->getType()) - return false; - - // We can only propagate thread independent values through callbacks. - // This is different to direct/indirect call sites because for them we - // know the thread executing the caller and callee is the same. For - // callbacks this is not guaranteed, thus a thread dependent value could - // be different for the caller and callee, making it invalid to propagate. - if (C && ACS.isCallbackCall() && C->isThreadDependent()) { - // Argument became non-constant. If all arguments are non-constant now, - // give up on this function. - if (++NumNonconstant == ArgumentConstants.size()) - return false; - - ArgumentConstants[i].setInt(true); - continue; - } - - if (C && ArgumentConstants[i].getPointer() == nullptr) { - ArgumentConstants[i].setPointer(C); // First constant seen. - } else if (C && ArgumentConstants[i].getPointer() == C) { - // Still the constant value we think it is. - } else if (V == &*Arg) { - // Ignore recursive calls passing argument down. - } else { - // Argument became non-constant. If all arguments are non-constant now, - // give up on this function. - if (++NumNonconstant == ArgumentConstants.size()) - return false; - ArgumentConstants[i].setInt(true); - } - } - } - - // If we got to this point, there is a constant argument! - assert(NumNonconstant != ArgumentConstants.size()); - bool MadeChange = false; - Function::arg_iterator AI = F.arg_begin(); - for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) { - // Do we have a constant argument? - if (ArgumentConstants[i].getInt() || AI->use_empty() || - (AI->hasByValAttr() && !F.onlyReadsMemory())) - continue; - - Value *V = ArgumentConstants[i].getPointer(); - if (!V) V = UndefValue::get(AI->getType()); - AI->replaceAllUsesWith(V); - ++NumArgumentsProped; - MadeChange = true; - } - return MadeChange; -} - - -// Check to see if this function returns one or more constants. If so, replace -// all callers that use those return values with the constant value. This will -// leave in the actual return values and instructions, but deadargelim will -// clean that up. -// -// Additionally if a function always returns one of its arguments directly, -// callers will be updated to use the value they pass in directly instead of -// using the return value. -static bool PropagateConstantReturn(Function &F) { - if (F.getReturnType()->isVoidTy()) - return false; // No return value. - - // We can infer and propagate the return value only when we know that the - // definition we'll get at link time is *exactly* the definition we see now. 
- // For more details, see GlobalValue::mayBeDerefined. - if (!F.isDefinitionExact()) - return false; - - // Don't touch naked functions. The may contain asm returning - // value we don't see, so we may end up interprocedurally propagating - // the return value incorrectly. - if (F.hasFnAttribute(Attribute::Naked)) - return false; - - // Check to see if this function returns a constant. - SmallVector RetVals; - StructType *STy = dyn_cast(F.getReturnType()); - if (STy) - for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) - RetVals.push_back(UndefValue::get(STy->getElementType(i))); - else - RetVals.push_back(UndefValue::get(F.getReturnType())); - - unsigned NumNonConstant = 0; - for (BasicBlock &BB : F) - if (ReturnInst *RI = dyn_cast(BB.getTerminator())) { - for (unsigned i = 0, e = RetVals.size(); i != e; ++i) { - // Already found conflicting return values? - Value *RV = RetVals[i]; - if (!RV) - continue; - - // Find the returned value - Value *V; - if (!STy) - V = RI->getOperand(0); - else - V = FindInsertedValue(RI->getOperand(0), i); - - if (V) { - // Ignore undefs, we can change them into anything - if (isa(V)) - continue; - - // Try to see if all the rets return the same constant or argument. - if (isa(V) || isa(V)) { - if (isa(RV)) { - // No value found yet? Try the current one. - RetVals[i] = V; - continue; - } - // Returning the same value? Good. - if (RV == V) - continue; - } - } - // Different or no known return value? Don't propagate this return - // value. - RetVals[i] = nullptr; - // All values non-constant? Stop looking. - if (++NumNonConstant == RetVals.size()) - return false; - } - } - - // If we got here, the function returns at least one constant value. Loop - // over all users, replacing any uses of the return value with the returned - // constant. - bool MadeChange = false; - for (Use &U : F.uses()) { - CallBase *CB = dyn_cast(U.getUser()); - - // Not a call instruction or a call instruction that's not calling F - // directly? - if (!CB || !CB->isCallee(&U)) - continue; - - // Call result not used? - if (CB->use_empty()) - continue; - - MadeChange = true; - - if (!STy) { - Value* New = RetVals[0]; - if (Argument *A = dyn_cast(New)) - // Was an argument returned? Then find the corresponding argument in - // the call instruction and use that. - New = CB->getArgOperand(A->getArgNo()); - CB->replaceAllUsesWith(New); - continue; - } - - for (auto I = CB->user_begin(), E = CB->user_end(); I != E;) { - Instruction *Ins = cast(*I); - - // Increment now, so we can remove the use - ++I; - - // Find the index of the retval to replace with - int index = -1; - if (ExtractValueInst *EV = dyn_cast(Ins)) - if (EV->getNumIndices() == 1) - index = *EV->idx_begin(); - - // If this use uses a specific return value, and we have a replacement, - // replace it. - if (index != -1) { - Value *New = RetVals[index]; - if (New) { - if (Argument *A = dyn_cast(New)) - // Was an argument returned? Then find the corresponding argument in - // the call instruction and use that. 
- New = CB->getArgOperand(A->getArgNo()); - Ins->replaceAllUsesWith(New); - Ins->eraseFromParent(); - } - } - } - } - - if (MadeChange) ++NumReturnValProped; - return MadeChange; -} - -char IPCP::ID = 0; -INITIALIZE_PASS(IPCP, "ipconstprop", - "Interprocedural constant propagation", false, false) - -ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); } - -bool IPCP::runOnModule(Module &M) { - if (skipModule(M)) - return false; - - bool Changed = false; - bool LocalChange = true; - - // FIXME: instead of using smart algorithms, we just iterate until we stop - // making changes. - while (LocalChange) { - LocalChange = false; - for (Function &F : M) - if (!F.isDeclaration()) { - // Delete any klingons. - F.removeDeadConstantUsers(); - if (F.hasLocalLinkage()) - LocalChange |= PropagateConstantsIntoArguments(F); - Changed |= PropagateConstantReturn(F); - } - Changed |= LocalChange; - } - return Changed; -} diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp index d37b9236380d4..45aca100086d4 100644 --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -35,7 +35,6 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeGlobalOptLegacyPassPass(Registry); initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); - initializeIPCPPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); @@ -104,10 +103,6 @@ void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createGlobalOptimizerPass()); } -void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createIPConstantPropagationPass()); -} - void LLVMAddPruneEHPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPruneEHPass()); } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f664a24173747..93f1e5392eb2c 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1339,10 +1339,21 @@ PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C, return PreservedAnalyses::all(); SmallVector SCC; - for (LazyCallGraph::Node &N : C) - SCC.push_back(&N.getFunction()); + // If there are kernels in the module, we have to run on all SCC's. + bool SCCIsInteresting = !OMPInModule.getKernels().empty(); + for (LazyCallGraph::Node &N : C) { + Function *Fn = &N.getFunction(); + SCC.push_back(Fn); + + // Do we already know that the SCC contains kernels, + // or that OpenMP functions are called from this SCC? + if (SCCIsInteresting) + continue; + // If not, let's check that. + SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); + } - if (SCC.empty()) + if (!SCCIsInteresting || SCC.empty()) return PreservedAnalyses::all(); FunctionAnalysisManager &FAM = @@ -1401,12 +1412,23 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass { return false; SmallVector SCC; - for (CallGraphNode *CGN : CGSCC) - if (Function *Fn = CGN->getFunction()) - if (!Fn->isDeclaration()) - SCC.push_back(Fn); + // If there are kernels in the module, we have to run on all SCC's. + bool SCCIsInteresting = !OMPInModule.getKernels().empty(); + for (CallGraphNode *CGN : CGSCC) { + Function *Fn = CGN->getFunction(); + if (!Fn || Fn->isDeclaration()) + continue; + SCC.push_back(Fn); - if (SCC.empty()) + // Do we already know that the SCC contains kernels, + // or that OpenMP functions are called from this SCC? 
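Both the new-PM and legacy OpenMPOpt drivers now gate on the question asked in the comment above, and the per-function answer is precomputed once per module by walking the users of each known OpenMP runtime function (see RecordFunctionsContainingUsesOf in the containsOpenMP hunk further below). A standalone sketch of that caching step, with a hypothetical callers set standing in for OpenMPInModule's bookkeeping:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"

// For one known runtime function, remember every function that calls it.
static void recordCallers(llvm::Function *RuntimeFn,
                          llvm::SmallPtrSetImpl<llvm::Function *> &Callers) {
  for (llvm::User *U : RuntimeFn->users())
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(U))
      Callers.insert(I->getFunction()); // the function containing this use
}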
+ if (SCCIsInteresting) + continue; + // If not, let's check that. + SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn); + } + + if (!SCCIsInteresting || SCC.empty()) return false; CallGraph &CG = getAnalysis().getCallGraph(); @@ -1468,13 +1490,19 @@ bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) { if (OMPInModule.isKnown()) return OMPInModule; + auto RecordFunctionsContainingUsesOf = [&](Function *F) { + for (User *U : F->users()) + if (auto *I = dyn_cast(U)) + OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction()); + }; + // MSVC doesn't like long if-else chains for some reason and instead just // issues an error. Work around it.. do { #define OMP_RTL(_Enum, _Name, ...) \ - if (M.getFunction(_Name)) { \ + if (Function *F = M.getFunction(_Name)) { \ + RecordFunctionsContainingUsesOf(F); \ OMPInModule = true; \ - break; \ } #include "llvm/Frontend/OpenMP/OMPKinds.def" } while (false); diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 5d863f1330a44..e1dc036ae413c 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1181,8 +1181,7 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { return BB == ClonedOI->ReturnBlock || - (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != - ClonedOI->Entries.end()); + llvm::is_contained(ClonedOI->Entries, BB); }; assert(ClonedOI && "Expecting OutlineInfo for single region outline"); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index c045c277706b3..22daa8e812f69 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -784,10 +784,13 @@ void PassManagerBuilder::populateModulePassManager( // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. + // FIXME: study whether hoisting and/or sinking of common instructions should + // be delayed until after SLP vectorizer. MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) + .hoistCommonInsts(true) .sinkCommonInsts(true))); if (SLPVectorize) { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index b6871e260532d..5978f2dc6d1dd 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -995,6 +995,8 @@ bool SampleProfileLoader::inlineHotFunctions( const FunctionSamples *FS = nullptr; if (auto *CB = dyn_cast(&I)) { if (!isa(I) && (FS = findCalleeFunctionSamples(*CB))) { + assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && + "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); if (FS->getEntrySamples() > 0) localNotInlinedCallSites.try_emplace(CB, FS); @@ -1104,16 +1106,26 @@ bool SampleProfileLoader::inlineHotFunctions( } if (ProfileMergeInlinee) { - // Use entry samples as head samples during the merge, as inlinees - // don't have head samples. - assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); - const_cast(FS)->addHeadSamples(FS->getEntrySamples()); - - // Note that we have to do the merge right after processing function. 
- // This allows OutlineFS's profile to be used for annotation during - // top-down processing of functions' annotation. - FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); - OutlineFS->merge(*FS); + // A function call can be replicated by optimizations like callsite + // splitting or jump threading and the replicates end up sharing the + // sample nested callee profile instead of slicing the original inlinee's + // profile. We want to do merge exactly once by filtering out callee + // profiles with a non-zero head sample count. + if (FS->getHeadSamples() == 0) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + const_cast(FS)->addHeadSamples( + FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else + assert(FS->getHeadSamples() == FS->getEntrySamples() && + "Expect same head and entry sample counts for profiles already " + "merged."); } else { auto pair = notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 030d2f203ed6c..9d7effc724beb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3351,6 +3351,21 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { match(Op1, m_Not(m_Specific(A)))) return BinaryOperator::CreateNot(Builder.CreateAnd(A, B)); + // (A | B) ^ (A | C) --> (B ^ C) & ~A -- There are 4 commuted variants. + // TODO: Loosen one-use restriction if common operand is a constant. + Value *D; + if (match(Op0, m_OneUse(m_Or(m_Value(A), m_Value(B)))) && + match(Op1, m_OneUse(m_Or(m_Value(C), m_Value(D))))) { + if (B == C || B == D) + std::swap(A, B); + if (A == C) + std::swap(C, D); + if (A == D) { + Value *NotA = Builder.CreateNot(A); + return BinaryOperator::CreateAnd(Builder.CreateXor(B, C), NotA); + } + } + if (auto *LHS = dyn_cast(I.getOperand(0))) if (auto *RHS = dyn_cast(I.getOperand(1))) if (Value *V = foldXorOfICmps(LHS, RHS, I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index ed93c33c1a597..4eb3e2e4434fd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -769,6 +769,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) return replaceInstUsesWith(CI, V); return nullptr; + case Intrinsic::abs: { + Value *IIOperand = II->getArgOperand(0); + // abs(-x) -> abs(x) + // TODO: Copy nsw if it was present on the neg? 
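Returning to the (A | B) ^ (A | C) --> (B ^ C) & ~A fold added in InstCombineAndOrXor above: the identity is easy to check bit by bit. If a bit of A is set, both or-results have that bit set and the xor clears it; if the bit of A is clear, the xor is simply B ^ C. An exhaustive verification in plain standalone C++ (not LLVM code; the identity is per-bit, so byte values cover the general case):

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned C = 0; C < 256; ++C)
        assert((((A | B) ^ (A | C)) & 0xFF) == (((B ^ C) & ~A) & 0xFF));
  return 0;
}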
+ Value *X; + if (match(IIOperand, m_Neg(m_Value(X)))) + return replaceOperand(*II, 0, X); + + break; + } case Intrinsic::bswap: { Value *IIOperand = II->getArgOperand(0); Value *X = nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 2b8732fdb80d7..ca2bcb9bc4151 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -5643,10 +5643,10 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { // Try to optimize equality comparisons against alloca-based pointers. if (Op0->getType()->isPointerTy() && I.isEquality()) { assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); - if (auto *Alloca = dyn_cast(GetUnderlyingObject(Op0, DL))) + if (auto *Alloca = dyn_cast(getUnderlyingObject(Op0))) if (Instruction *New = foldAllocaCmp(I, Alloca, Op1)) return New; - if (auto *Alloca = dyn_cast(GetUnderlyingObject(Op1, DL))) + if (auto *Alloca = dyn_cast(getUnderlyingObject(Op1))) if (Instruction *New = foldAllocaCmp(I, Alloca, Op0)) return New; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index e533198bd489d..c14a330542c9e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2849,8 +2849,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { return replaceOperand(SI, 1, TrueSI->getTrueValue()); } // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b) - // We choose this as normal form to enable folding on the And and shortening - // paths for the values (this helps GetUnderlyingObjects() for example). + // We choose this as normal form to enable folding on the And and + // shortening paths for the values (this helps getUnderlyingObjects() for + // example). if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) { Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition()); replaceOperand(SI, 0, And); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fdf0aaf9b176d..ceeddda05fd9a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1048,7 +1048,9 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { BasicBlock *NonConstBB = nullptr; for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InVal = PN->getIncomingValue(i); - if (isa(InVal) && !isa(InVal)) + // If I is a freeze instruction, count undef as a non-constant. + if (isa(InVal) && !isa(InVal) && + (!isa(I) || isGuaranteedNotToBeUndefOrPoison(InVal))) continue; if (isa(InVal)) return nullptr; // Itself a phi. 
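Several hunks in this area switch from the old GetUnderlyingObject(V, DL) helper to getUnderlyingObject(V), which no longer takes a DataLayout argument. Typical usage, as in the alloca-comparison fold above, looks like this (a sketch assuming the usual LLVM headers; getUnderlyingObject strips casts, GEPs and similar wrappers up to a depth limit):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

static bool isBasedOnAlloca(llvm::Value *Op) {
  // Walk to the base object and test what it is.
  return llvm::isa<llvm::AllocaInst>(llvm::getUnderlyingObject(Op));
}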
@@ -1141,6 +1143,15 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { Builder); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } + } else if (isa(&I)) { + for (unsigned i = 0; i != NumPHIValues; ++i) { + Value *InV; + if (NonConstBB == PN->getIncomingBlock(i)) + InV = Builder.CreateFreeze(PN->getIncomingValue(i), "phi.fr"); + else + InV = PN->getIncomingValue(i); + NewPN->addIncoming(InV, PN->getIncomingBlock(i)); + } } else { CastInst *CI = cast(&I); Type *RetTy = CI->getType(); @@ -1588,7 +1599,7 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { Constant *C; if (match(&Inst, m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))), - m_Constant(C))) && + m_Constant(C))) && !isa(C) && cast(V1->getType())->getNumElements() <= NumElts) { assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && "Shuffle should not change scalar type"); @@ -3370,6 +3381,12 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); + // freeze (phi const, x) --> phi const, (freeze x) + if (auto *PN = dyn_cast(Op0)) { + if (Instruction *NV = foldOpIntoPhi(I, PN)) + return NV; + } + return nullptr; } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 647d25e6a24e0..85ab19c20e84d 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -911,10 +911,6 @@ struct FunctionStackPoisoner : public InstVisitor { AllocaInst *DynamicAllocaLayout = nullptr; IntrinsicInst *LocalEscapeCall = nullptr; - // Maps Value to an AllocaInst from which the Value is originated. - using AllocaForValueMapTy = DenseMap; - AllocaForValueMapTy AllocaForValue; - bool HasInlineAsm = false; bool HasReturnsTwiceCall = false; @@ -1065,8 +1061,7 @@ struct FunctionStackPoisoner : public InstVisitor { !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) return; // Find alloca instruction that corresponds to llvm.lifetime argument. - AllocaInst *AI = - llvm::findAllocaForValue(II.getArgOperand(1), AllocaForValue); + AllocaInst *AI = findAllocaForValue(II.getArgOperand(1)); if (!AI) { HasUntracedLifetimeIntrinsic = true; return; @@ -1561,7 +1556,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, if (ClOpt && ClOptGlobals) { // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. - GlobalVariable *G = dyn_cast(GetUnderlyingObject(Addr, DL)); + GlobalVariable *G = dyn_cast(getUnderlyingObject(Addr)); if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) && isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) { NumOptimizedAccessesToGlobalVar++; @@ -1571,7 +1566,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, if (ClOpt && ClOptStack) { // A direct inbounds access to a stack variable is always valid. 
- if (isa(GetUnderlyingObject(Addr, DL)) && + if (isa(getUnderlyingObject(Addr)) && isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) { NumOptimizedAccessesToStackVar++; return; diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2846319007318..523e7b19ecb1d 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -46,6 +46,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -78,6 +79,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -176,6 +178,14 @@ static cl::opt ClEventCallbacks( cl::desc("Insert calls to __dfsan_*_callback functions on data events."), cl::Hidden, cl::init(false)); +// Use a distinct bit for each base label, enabling faster unions with less +// instrumentation. Limits the max number of base labels to 16. +static cl::opt ClFast16Labels( + "dfsan-fast-16-labels", + cl::desc("Use more efficient instrumentation, limiting the number of " + "labels to 16."), + cl::Hidden, cl::init(false)); + static StringRef GetGlobalTypeString(const GlobalValue &G) { // Types of GlobalVariables are always pointer types. Type *GType = G.getValueType(); @@ -292,7 +302,7 @@ AttributeList TransformFunctionAttributes( llvm::makeArrayRef(ArgumentAttributes)); } -class DataFlowSanitizer : public ModulePass { +class DataFlowSanitizer { friend struct DFSanFunction; friend class DFSanVisitor; @@ -344,8 +354,6 @@ class DataFlowSanitizer : public ModulePass { ConstantInt *ShadowPtrMul; Constant *ArgTLS; Constant *RetvalTLS; - void *(*GetArgTLSPtr)(); - void *(*GetRetvalTLSPtr)(); FunctionType *GetArgTLSTy; FunctionType *GetRetvalTLSTy; Constant *GetArgTLS; @@ -362,6 +370,7 @@ class DataFlowSanitizer : public ModulePass { FunctionCallee DFSanUnionFn; FunctionCallee DFSanCheckedUnionFn; FunctionCallee DFSanUnionLoadFn; + FunctionCallee DFSanUnionLoadFast16LabelsFn; FunctionCallee DFSanUnimplementedFn; FunctionCallee DFSanSetLabelFn; FunctionCallee DFSanNonzeroLabelFn; @@ -392,15 +401,12 @@ class DataFlowSanitizer : public ModulePass { void initializeCallbackFunctions(Module &M); void initializeRuntimeFunctions(Module &M); -public: - static char ID; + bool init(Module &M); - DataFlowSanitizer( - const std::vector &ABIListFiles = std::vector(), - void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); +public: + DataFlowSanitizer(const std::vector &ABIListFiles); - bool doInitialization(Module &M) override; - bool runOnModule(Module &M) override; + bool runImpl(Module &M); }; struct DFSanFunction { @@ -485,22 +491,8 @@ class DFSanVisitor : public InstVisitor { } // end anonymous namespace -char DataFlowSanitizer::ID; - -INITIALIZE_PASS(DataFlowSanitizer, "dfsan", - "DataFlowSanitizer: dynamic data flow analysis.", false, false) - -ModulePass * -llvm::createDataFlowSanitizerPass(const std::vector &ABIListFiles, - void *(*getArgTLS)(), - void *(*getRetValTLS)()) { - return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS); -} - DataFlowSanitizer::DataFlowSanitizer( - const std::vector &ABIListFiles, void *(*getArgTLS)(), - void *(*getRetValTLS)()) - : 
ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + const std::vector &ABIListFiles) { std::vector AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); @@ -565,7 +557,7 @@ TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { ArgumentIndexMapping); } -bool DataFlowSanitizer::doInitialization(Module &M) { +bool DataFlowSanitizer::init(Module &M) { Triple TargetTriple(M.getTargetTriple()); bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64; bool IsMIPS64 = TargetTriple.isMIPS64(); @@ -613,22 +605,6 @@ bool DataFlowSanitizer::doInitialization(Module &M) { FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs, /*isVarArg=*/false); - if (GetArgTLSPtr) { - Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); - ArgTLS = nullptr; - GetArgTLSTy = FunctionType::get(PointerType::getUnqual(ArgTLSTy), false); - GetArgTLS = ConstantExpr::getIntToPtr( - ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), - PointerType::getUnqual(GetArgTLSTy)); - } - if (GetRetvalTLSPtr) { - RetvalTLS = nullptr; - GetRetvalTLSTy = FunctionType::get(PointerType::getUnqual(ShadowTy), false); - GetRetvalTLS = ConstantExpr::getIntToPtr( - ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), - PointerType::getUnqual(GetRetvalTLSTy)); - } - ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); return true; } @@ -781,6 +757,17 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanUnionLoadFn = Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL); } + { + AttributeList AL; + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::NoUnwind); + AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex, + Attribute::ReadOnly); + AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, + Attribute::ZExt); + DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction( + "__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL); + } DFSanUnimplementedFn = Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); { @@ -807,7 +794,9 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) { DFSanLoadStoreCmpCallbackFnTy); } -bool DataFlowSanitizer::runOnModule(Module &M) { +bool DataFlowSanitizer::runImpl(Module &M) { + init(M); + if (ABIList.isIn(M, "skip")) return false; @@ -816,20 +805,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) { bool Changed = false; - if (!GetArgTLSPtr) { - Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); - ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); - if (GlobalVariable *G = dyn_cast(ArgTLS)) { - Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel; - G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); - } + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); + if (GlobalVariable *G = dyn_cast(ArgTLS)) { + Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel; + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); } - if (!GetRetvalTLSPtr) { - RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); - if (GlobalVariable *G = dyn_cast(RetvalTLS)) { - Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel; - G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); - } + RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); + if (GlobalVariable *G = dyn_cast(RetvalTLS)) { + 
Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel; + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); } ExternalShadowMask = @@ -845,6 +830,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { &i != DFSanUnionFn.getCallee()->stripPointerCasts() && &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() && &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() && + &i != DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts() && &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() && &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() && &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() && @@ -1179,7 +1165,10 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { return CCS.Shadow; IRBuilder<> IRB(Pos); - if (AvoidNewBlocks) { + if (ClFast16Labels) { + CCS.Block = Pos->getParent(); + CCS.Shadow = IRB.CreateOr(V1, V2); + } else if (AvoidNewBlocks) { CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2}); Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); Call->addParamAttr(0, Attribute::ZExt); @@ -1257,7 +1246,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes); SmallVector Objs; - GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout()); + getUnderlyingObjects(Addr, Objs); bool AllConstants = true; for (const Value *Obj : Objs) { if (isa(Obj) || isa(Obj)) @@ -1289,6 +1278,30 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, IRB.CreateAlignedLoad(DFS.ShadowTy, ShadowAddr1, ShadowAlign), Pos); } } + + if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0) { + // First OR all the WideShadows, then OR individual shadows within the + // combined WideShadow. This is fewer instructions than ORing shadows + // individually. + IRBuilder<> IRB(Pos); + Value *WideAddr = + IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *CombinedWideShadow = + IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign); + for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size; + Ofs += 64 / DFS.ShadowWidthBits) { + WideAddr = IRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr, + ConstantInt::get(DFS.IntptrTy, 1)); + Value *NextWideShadow = + IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign); + CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, NextWideShadow); + } + for (unsigned Width = 32; Width >= DFS.ShadowWidthBits; Width >>= 1) { + Value *ShrShadow = IRB.CreateLShr(CombinedWideShadow, Width); + CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, ShrShadow); + } + return IRB.CreateTrunc(CombinedWideShadow, DFS.ShadowTy); + } if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) { // Fast path for the common case where each byte has identical shadow: load // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any @@ -1355,8 +1368,10 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, } IRBuilder<> IRB(Pos); + FunctionCallee &UnionLoadFn = + ClFast16Labels ? 
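// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// Why -dfsan-fast-16-labels can drop the runtime union calls: with at most 16
// base labels and one bit per label, the union of two label sets is a plain
// bitwise OR, so no __dfsan_union table lookup is needed. A host-side model
// of the idea (names here are hypothetical, not DFSan API):
#include <cstdint>

using Label16 = uint16_t; // At most 16 base labels, one bit each.

inline Label16 unionLabels(Label16 A, Label16 B) {
  return A | B; // Set union == bitwise OR under the one-bit-per-label encoding.
}

// Collapsing a 64-bit "wide shadow" (four 16-bit shadows loaded at once) down
// to a single 16-bit shadow mirrors the halving loop in loadShadow() above:
inline Label16 collapseWideShadow(uint64_t Wide) {
  for (unsigned Width = 32; Width >= 16; Width >>= 1)
    Wide |= Wide >> Width;
  return static_cast<Label16>(Wide);
}
// ---------------------------------------------------------------------------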
DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn; CallInst *FallbackCall = IRB.CreateCall( - DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); + UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); return FallbackCall; } @@ -1843,3 +1858,39 @@ void DFSanVisitor::visitPHINode(PHINode &PN) { DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN)); DFSF.setShadow(&PN, ShadowPN); } + +namespace { +class DataFlowSanitizerLegacyPass : public ModulePass { +private: + std::vector ABIListFiles; + +public: + static char ID; + + DataFlowSanitizerLegacyPass( + const std::vector &ABIListFiles = std::vector()) + : ModulePass(ID), ABIListFiles(ABIListFiles) {} + + bool runOnModule(Module &M) override { + return DataFlowSanitizer(ABIListFiles).runImpl(M); + } +}; +} // namespace + +char DataFlowSanitizerLegacyPass::ID; + +INITIALIZE_PASS(DataFlowSanitizerLegacyPass, "dfsan", + "DataFlowSanitizer: dynamic data flow analysis.", false, false) + +ModulePass *llvm::createDataFlowSanitizerLegacyPassPass( + const std::vector &ABIListFiles) { + return new DataFlowSanitizerLegacyPass(ABIListFiles); +} + +PreservedAnalyses DataFlowSanitizerPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (DataFlowSanitizer(ABIListFiles).runImpl(M)) { + return PreservedAnalyses::none(); + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index cd2ea8d5e4edd..53a89f7348de4 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" @@ -300,15 +301,16 @@ namespace { assert(OutEdges.empty()); } + uint32_t Number; + SmallVector OutEdges; + private: friend class GCOVFunction; GCOVBlock(GCOVProfiler *P, uint32_t Number) : GCOVRecord(P), Number(Number) {} - uint32_t Number; StringMap LinesByFile; - SmallVector OutEdges; }; // A function has a unique identifier, a checksum (we leave as zero) and a @@ -347,18 +349,6 @@ namespace { return ReturnBlock; } - std::string getEdgeDestinations() { - std::string EdgeDestinations; - raw_string_ostream EDOS(EdgeDestinations); - Function *F = Blocks.begin()->first->getParent(); - for (BasicBlock &I : *F) { - GCOVBlock &Block = getBlock(&I); - for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) - EDOS << Block.OutEdges[i]->Number; - } - return EdgeDestinations; - } - uint32_t getFuncChecksum() const { return FuncChecksum; } @@ -729,7 +719,7 @@ void GCOVProfiler::emitProfileNotes() { continue; } - std::string EdgeDestinations; + std::vector EdgeDestinations; Endian = M->getDataLayout().isLittleEndian() ? 
support::endianness::little : support::endianness::big; @@ -774,6 +764,11 @@ void GCOVProfiler::emitProfileNotes() { } else if (isa(TI)) { Block.addEdge(Func.getReturnBlock()); } + for (GCOVBlock *Succ : Block.OutEdges) { + uint32_t Idx = Succ->Number; + do EdgeDestinations.push_back(Idx & 255); + while ((Idx >>= 8) > 0); + } for (auto &I : BB) { // Debug intrinsic locations correspond to the location of the @@ -798,12 +793,13 @@ void GCOVProfiler::emitProfileNotes() { } Line = 0; } - EdgeDestinations += Func.getEdgeDestinations(); } char Tmp[4]; + JamCRC JC; + JC.update(EdgeDestinations); os = &out; - auto Stamp = static_cast(hash_value(EdgeDestinations)); + uint32_t Stamp = JC.getCRC(); FileChecksums.push_back(Stamp); if (Endian == support::endianness::big) { out.write("gcno", 4); diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 7b03bbfcdfe4b..42939c8dbc017 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -57,6 +57,8 @@ using namespace llvm; #define DEBUG_TYPE "instrprof" +// FIXME: These are to be removed after switching to the new memop value +// profiling. // The start and end values of precise value profile range for memory // intrinsic sizes cl::opt MemOPSizeRange( @@ -72,6 +74,12 @@ cl::opt MemOPSizeLarge( "Value of 0 disables the large value profiling."), cl::init(8192)); +cl::opt UseOldMemOpValueProf( + "use-old-memop-value-prof", + cl::desc("Use the old memop value profiling buckets. This is " + "transitional and to be removed after switching. "), + cl::init(false)); + namespace { cl::opt DoHashBasedCounterSplit( @@ -150,6 +158,10 @@ cl::opt IterativeCounterPromotion( cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true), cl::desc("Allow counter promotion across the whole loop nest.")); +cl::opt SkipRetExitBlock( + cl::ZeroOrMore, "skip-ret-exit-block", cl::init(true), + cl::desc("Suppress counter promotion if exit blocks contain ret.")); + class InstrProfilingLegacyPass : public ModulePass { InstrProfiling InstrProf; @@ -272,6 +284,18 @@ class PGOCounterPromoter { // Skip 'infinite' loops: if (ExitBlocks.size() == 0) return false; + + // Skip if any of the ExitBlocks contains a ret instruction. + // This is to prevent dumping of an incomplete profile -- if the + // loop is a long-running loop and dump is called in the middle + // of the loop, the resulting profile is incomplete. + // FIXME: add other heuristics to detect long running loops. + if (SkipRetExitBlock) { + for (auto BB : ExitBlocks) + if (dyn_cast(BB->getTerminator()) != nullptr) + return false; + } + unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L); if (MaxProm == 0) return false; @@ -395,6 +419,19 @@ class PGOCounterPromoter { BlockFrequencyInfo *BFI; }; +enum class ValueProfilingCallType { + // Individual values are tracked. Currently used for indirect call target + // profiling. + Default, + + // The old memop size value profiling. FIXME: To be removed after switching to + // the new one. + OldMemOp, + + // MemOp: the (new) memop size value profiling with extended buckets.
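// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// The GCOVProfiling hunk above replaces the string-based gcno stamp with a
// CRC over raw bytes: each successor block number is appended little-endian,
// one byte at a time, until the value is exhausted, and the whole byte vector
// is then fed to JamCRC. A standalone model of the encoding step:
#include <cstdint>
#include <vector>

void appendEdgeDest(std::vector<uint8_t> &Out, uint32_t Idx) {
  do // At least one byte is emitted, even for block number 0.
    Out.push_back(Idx & 255);
  while ((Idx >>= 8) > 0);
}
// ---------------------------------------------------------------------------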
+ MemOp +}; + } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { @@ -579,9 +616,9 @@ bool InstrProfiling::run( return true; } -static FunctionCallee -getOrInsertValueProfilingCall(Module &M, const TargetLibraryInfo &TLI, - bool IsRange = false) { +static FunctionCallee getOrInsertValueProfilingCall( + Module &M, const TargetLibraryInfo &TLI, + ValueProfilingCallType CallType = ValueProfilingCallType::Default) { LLVMContext &Ctx = M.getContext(); auto *ReturnTy = Type::getVoidTy(M.getContext()); @@ -589,16 +626,22 @@ getOrInsertValueProfilingCall(Module &M, const TargetLibraryInfo &TLI, if (auto AK = TLI.getExtAttrForI32Param(false)) AL = AL.addParamAttribute(M.getContext(), 2, AK); - if (!IsRange) { + if (CallType == ValueProfilingCallType::Default || + CallType == ValueProfilingCallType::MemOp) { Type *ParamTypes[] = { #define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType #include "llvm/ProfileData/InstrProfData.inc" }; auto *ValueProfilingCallTy = FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false); - return M.getOrInsertFunction(getInstrProfValueProfFuncName(), - ValueProfilingCallTy, AL); + StringRef FuncName = CallType == ValueProfilingCallType::Default + ? getInstrProfValueProfFuncName() + : getInstrProfValueProfMemOpFuncName(); + return M.getOrInsertFunction(FuncName, ValueProfilingCallTy, AL); } else { + // FIXME: This code is to be removed after switching to the new memop value + // profiling. + assert(CallType == ValueProfilingCallType::OldMemOp); Type *RangeParamTypes[] = { #define VALUE_RANGE_PROF 1 #define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType @@ -638,8 +681,8 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { Index += It->second.NumValueSites[Kind]; IRBuilder<> Builder(Ind); - bool IsRange = (Ind->getValueKind()->getZExtValue() == - llvm::InstrProfValueKind::IPVK_MemOPSize); + bool IsMemOpSize = (Ind->getValueKind()->getZExtValue() == + llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; auto *TLI = &GetTLI(*Ind->getFunction()); @@ -649,12 +692,19 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { // WinEHPrepare pass. SmallVector OpBundles; Ind->getOperandBundlesAsDefs(OpBundles); - if (!IsRange) { + if (!IsMemOpSize) { Value *Args[3] = {Ind->getTargetValue(), Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), Builder.getInt32(Index)}; Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args, OpBundles); + } else if (!UseOldMemOpValueProf) { + Value *Args[3] = {Ind->getTargetValue(), + Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), + Builder.getInt32(Index)}; + Call = Builder.CreateCall( + getOrInsertValueProfilingCall(*M, *TLI, ValueProfilingCallType::MemOp), + Args, OpBundles); } else { Value *Args[6] = { Ind->getTargetValue(), @@ -663,7 +713,8 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { Builder.getInt64(MemOPSizeRangeStart), Builder.getInt64(MemOPSizeRangeLast), Builder.getInt64(MemOPSizeLarge == 0 ? 
INT64_MIN : MemOPSizeLarge)}; - Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), + Call = Builder.CreateCall(getOrInsertValueProfilingCall( + *M, *TLI, ValueProfilingCallType::OldMemOp), Args, OpBundles); } if (auto AK = TLI->getExtAttrForI32Param(false)) @@ -831,9 +882,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { Visibility = GlobalValue::HiddenVisibility; } } + std::string DataVarName = getVarName(Inc, getInstrProfDataVarPrefix()); auto MaybeSetComdat = [=](GlobalVariable *GV) { if (NeedComdat) - GV->setComdat(M->getOrInsertComdat(GV->getName())); + GV->setComdat(M->getOrInsertComdat(TT.isOSBinFormatCOFF() ? GV->getName() + : DataVarName)); }; uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); @@ -898,9 +951,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, #include "llvm/ProfileData/InstrProfData.inc" }; - auto *Data = new GlobalVariable(*M, DataTy, false, Linkage, - ConstantStruct::get(DataTy, DataVals), - getVarName(Inc, getInstrProfDataVarPrefix())); + auto *Data = + new GlobalVariable(*M, DataTy, false, Linkage, + ConstantStruct::get(DataTy, DataVals), DataVarName); Data->setVisibility(Visibility); Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index ad238f1357c69..eda38e7da4f4d 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -119,7 +119,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeHWAddressSanitizerLegacyPassPass(Registry); initializeThreadSanitizerLegacyPassPass(Registry); initializeModuleSanitizerCoverageLegacyPassPass(Registry); - initializeDataFlowSanitizerPass(Registry); + initializeDataFlowSanitizerLegacyPassPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index f4f62a31d89ee..0f354c1da490a 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -153,6 +153,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -1149,36 +1150,30 @@ struct MemorySanitizerVisitor : public InstVisitor { const DataLayout &DL = F.getParent()->getDataLayout(); const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); - if (Shadow->getType()->isArrayTy()) { - paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, - OriginAlignment); - } else { - Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB); - if (auto *ConstantShadow = dyn_cast(ConvertedShadow)) { - if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) - paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, - OriginAlignment); - return; - } - - unsigned TypeSizeInBits = - DL.getTypeSizeInBits(ConvertedShadow->getType()); - unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); - if (AsCall && SizeIndex < kNumberOfAccessSizes && 
!MS.CompileKernel) { - FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex]; - Value *ConvertedShadow2 = IRB.CreateZExt( - ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); - IRB.CreateCall(Fn, {ConvertedShadow2, - IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), - Origin}); - } else { - Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp"); - Instruction *CheckTerm = SplitBlockAndInsertIfThen( - Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); - IRBuilder<> IRBNew(CheckTerm); - paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize, + Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB); + if (auto *ConstantShadow = dyn_cast(ConvertedShadow)) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); - } + return; + } + + unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) { + FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, + {ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), Origin}); + } else { + Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize, + OriginAlignment); } } @@ -1410,12 +1405,31 @@ struct MemorySanitizerVisitor : public InstVisitor { return Aggregator; } + // Extract combined shadow of array elements + Value *collapseArrayShadow(ArrayType *Array, Value *Shadow, + IRBuilder<> &IRB) { + if (!Array->getNumElements()) + return IRB.getIntN(/* width */ 1, /* value */ 0); + + Value *FirstItem = IRB.CreateExtractValue(Shadow, 0); + Value *Aggregator = convertShadowToScalar(FirstItem, IRB); + + for (unsigned Idx = 1; Idx < Array->getNumElements(); Idx++) { + Value *ShadowItem = IRB.CreateExtractValue(Shadow, Idx); + Value *ShadowInner = convertShadowToScalar(ShadowItem, IRB); + Aggregator = IRB.CreateOr(Aggregator, ShadowInner); + } + return Aggregator; + } + /// Convert a shadow value to its flattened variant. The resulting /// shadow may not necessarily have the same bit width as the input /// value, but it will always be comparable to zero.
Value *convertShadowToScalar(Value *V, IRBuilder<> &IRB) { if (StructType *Struct = dyn_cast(V->getType())) return collapseStructShadow(Struct, V, IRB); + if (ArrayType *Array = dyn_cast(V->getType())) + return collapseArrayShadow(Array, V, IRB); Type *Ty = V->getType(); Type *NoVecTy = getShadowTyNoVec(Ty); if (Ty == NoVecTy) return V; @@ -1765,10 +1779,10 @@ struct MemorySanitizerVisitor : public InstVisitor { if (!InsertChecks) return; #ifndef NDEBUG Type *ShadowTy = Shadow->getType(); - assert( - (isa(ShadowTy) || isa(ShadowTy) || - isa(ShadowTy)) && - "Can only insert checks for integer, vector, and struct shadow types"); + assert((isa(ShadowTy) || isa(ShadowTy) || + isa(ShadowTy) || isa(ShadowTy)) && + "Can only insert checks for integer, vector, and aggregate shadow " + "types"); #endif InstrumentationList.push_back( ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); @@ -2679,9 +2693,7 @@ struct MemorySanitizerVisitor : public InstVisitor { void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; - DenseMap AllocaForValue; - AllocaInst *AI = - llvm::findAllocaForValue(I.getArgOperand(1), AllocaForValue); + AllocaInst *AI = llvm::findAllocaForValue(I.getArgOperand(1)); if (!AI) InstrumentLifetimeStart = false; LifetimeStartList.push_back(std::make_pair(&I, AI)); @@ -4056,6 +4068,12 @@ struct MemorySanitizerVisitor : public InstVisitor { setOrigin(&I, getCleanOrigin()); } + void visitFreezeInst(FreezeInst &I) { + // Freeze always returns a fully defined value. + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + void visitInstruction(Instruction &I) { // Everything else: stop propagating and check for poisoned shadow. if (ClDumpStrictInstructions) @@ -4738,15 +4756,14 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // For PowerPC, we need to deal with alignment of stack arguments - // they are mostly aligned to 8 bytes, but vectors and i128 arrays // are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes, - // and QPX vectors are aligned to 32 bytes. For that reason, we - // compute current offset from stack pointer (which is always properly - // aligned), and offset for the first vararg, then subtract them. + // For that reason, we compute current offset from stack pointer (which is + // always properly aligned), and offset for the first vararg, then subtract + // them. unsigned VAArgBase; Triple TargetTriple(F.getParent()->getTargetTriple()); // Parameter save area starts at 48 bytes from frame pointer for ABIv1, // and 32 bytes for ABIv2. This is usually determined by target // endianness, but in theory could be overridden by function attribute. - // For simplicity, we ignore it here (it'd only matter for QPX vectors). 
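// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// collapseArrayShadow() above reduces an array-typed shadow the same way the
// struct path does: each element's shadow is collapsed to a scalar and the
// scalars are ORed together, so the result is nonzero iff any element is
// poisoned. Modeled on plain integers:
#include <cstdint>
#include <vector>

uint64_t collapseShadow(const std::vector<uint64_t> &ElementShadows) {
  uint64_t Aggregator = 0; // An empty array collapses to a clean (zero) shadow.
  for (uint64_t S : ElementShadows)
    Aggregator |= S;
  return Aggregator;
}
// ---------------------------------------------------------------------------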
if (TargetTriple.getArch() == Triple::ppc64) VAArgBase = 48; else diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 67bf264e8eeaf..5a0185600e068 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -261,6 +261,10 @@ extern cl::opt PGOViewCounts; // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt ViewBlockFreqFuncName; +static cl::opt + PGOOldCFGHashing("pgo-instr-old-cfg-hashing", cl::init(false), cl::Hidden, + cl::desc("Use the old CFG function hashing")); + // Return a string describing the branch condition that can be // used in static branch probability heuristics: static std::string getBranchCondString(Instruction *TI) { @@ -620,7 +624,8 @@ template class FuncPGOInstrumentation { } // end anonymous namespace // Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index -// value of each BB in the CFG. The higher 32 bits record the number of edges. +// value of each BB in the CFG. The higher 32 bits are the CRC32 of the numbers +// of selects, indirect calls, mem ops and edges. template void FuncPGOInstrumentation::computeCFGHash() { std::vector Indexes; @@ -639,12 +644,31 @@ void FuncPGOInstrumentation::computeCFGHash() { } JC.update(Indexes); - // Hash format for context sensitive profile. Reserve 4 bits for other - // information. - FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 | - (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 | - //(uint64_t)ValueSites[IPVK_MemOPSize].size() << 40 | - (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); + JamCRC JCH; + if (PGOOldCFGHashing) { + // Hash format for context sensitive profile. Reserve 4 bits for other + // information. + FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 | + (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 | + //(uint64_t)ValueSites[IPVK_MemOPSize].size() << 40 | + (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); + } else { + // The higher 32 bits. + auto updateJCH = [&JCH](uint64_t Num) { + uint8_t Data[8]; + support::endian::write64le(Data, Num); + JCH.update(Data); + }; + updateJCH((uint64_t)SIVisitor.getNumOfSelectInsts()); + updateJCH((uint64_t)ValueSites[IPVK_IndirectCallTarget].size()); + updateJCH((uint64_t)ValueSites[IPVK_MemOPSize].size()); + updateJCH((uint64_t)MST.AllEdges.size()); + + // Hash format for context sensitive profile. Reserve 4 bits for other + // information. + FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC(); + } + // Reserve bit 60-63 for other information purpose. FunctionHash &= 0x0FFFFFFFFFFFFFFF; if (IsCS) @@ -653,8 +677,12 @@ void FuncPGOInstrumentation::computeCFGHash() { << " CRC = " << JC.getCRC() << ", Selects = " << SIVisitor.getNumOfSelectInsts() << ", Edges = " << MST.AllEdges.size() << ", ICSites = " - << ValueSites[IPVK_IndirectCallTarget].size() - << ", Hash = " << FunctionHash << "\n";); + << ValueSites[IPVK_IndirectCallTarget].size()); + if (!PGOOldCFGHashing) { + LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size() + << ", High32 CRC = " << JCH.getCRC()); + } + LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";); } // Check if we can safely rename this Comdat function. @@ -1020,7 +1048,8 @@ class PGOUseFunc { FreqAttr(FFA_Normal), IsCS(IsCS) {} // Read counts for the instrumented BB from profile. 
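// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// How the new two-level CFG hash in computeCFGHash() is assembled, using the
// same JamCRC and endian helpers the patch uses. The low bits still come from
// the CRC of the BB indexes (JC); the high bits now come from a second CRC
// (JCH) over the four little-endian-encoded counts.
#include "llvm/Support/CRC.h"
#include "llvm/Support/Endian.h"
#include <cstdint>

uint64_t newStyleCFGHash(uint64_t NumSelects, uint64_t NumIndirectCalls,
                         uint64_t NumMemOps, uint64_t NumEdges,
                         uint32_t IndexCRC /* JC.getCRC() */) {
  llvm::JamCRC JCH;
  auto Update = [&JCH](uint64_t Num) {
    uint8_t Data[8];
    llvm::support::endian::write64le(Data, Num);
    JCH.update(Data);
  };
  Update(NumSelects);
  Update(NumIndirectCalls);
  Update(NumMemOps);
  Update(NumEdges);
  // Bits 60-63 stay reserved for other uses, hence the final mask.
  uint64_t Hash = (uint64_t(JCH.getCRC()) << 28) + IndexCRC;
  return Hash & 0x0FFFFFFFFFFFFFFF;
}
// ---------------------------------------------------------------------------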
- bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros); + bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros, + bool &AllMinusOnes); // Populate the counts for all BBs. void populateCounters(); @@ -1131,11 +1160,18 @@ bool PGOUseFunc::setInstrumentedCounts( if (NumCounters != CountFromProfile.size()) { return false; } + auto *FuncEntry = &*F.begin(); + // Set the profile count to the Instrumented BBs. uint32_t I = 0; for (BasicBlock *InstrBB : InstrumentBBs) { uint64_t CountValue = CountFromProfile[I++]; UseBBInfo &Info = getBBInfo(InstrBB); + // If we reach here, we know that we have some nonzero count + // values in this function. The entry count should not be 0. + // Fix it if necessary. + if (InstrBB == FuncEntry && CountValue == 0) + CountValue = 1; Info.setBBInfoCount(CountValue); } ProfileCountSize = CountFromProfile.size(); @@ -1196,7 +1232,8 @@ void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { // Read the profile from ProfileFileName and assign the value to the // instrumented BB and the edges. This function also updates ProgramMaxCount. // Return true if the profile is successfully read, and false on errors. -bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) { +bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros, + bool &AllMinusOnes) { auto &Ctx = M->getContext(); Expected Result = PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); @@ -1239,10 +1276,13 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++; LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + AllMinusOnes = (CountFromProfile.size() > 0); uint64_t ValueSum = 0; for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n"); ValueSum += CountFromProfile[I]; + if (CountFromProfile[I] != (uint64_t)-1) + AllMinusOnes = false; } AllZeros = (ValueSum == 0); @@ -1326,7 +1366,6 @@ void PGOUseFunc::populateCounters() { } #endif uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; - F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real)); uint64_t FuncMaxCount = FuncEntryCount; for (auto &BB : F) { auto BI = findBBInfo(&BB); @@ -1334,6 +1373,11 @@ void PGOUseFunc::populateCounters() { continue; FuncMaxCount = std::max(FuncMaxCount, BI->CountValue); } + + // Fix the obviously inconsistent entry count. + if (FuncMaxCount > 0 && FuncEntryCount == 0) + FuncEntryCount = 1; + F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real)); markFunctionAttributes(FuncEntryCount, FuncMaxCount); // Now annotate select instructions @@ -1646,8 +1690,13 @@ static bool annotateAllFunctions( SplitIndirectBrCriticalEdges(F, BPI, BFI); PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS, InstrumentFuncEntry); + // When AllMinusOnes is true, it means the profile for the function + // is unrepresentative and this function is actually hot. Set the + // entry count of the function to a multiple of the hot threshold + // and drop all its internal counters.
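// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// The AllMinusOnes detection in isolation: a profile whose every counter
// reads (uint64_t)-1 is treated as a pseudo profile for a hot function. The
// caller below then sets the entry count to MultiplyFactor (3) times
// PSI->getHotCountThreshold() instead of annotating individual counters.
#include <cstdint>
#include <vector>

bool allCountersAreMinusOne(const std::vector<uint64_t> &Counts) {
  if (Counts.empty())
    return false; // No counters, nothing to conclude.
  for (uint64_t C : Counts)
    if (C != (uint64_t)-1)
      return false;
  return true;
}
// ---------------------------------------------------------------------------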
+ bool AllMinusOnes = false; bool AllZeros = false; - if (!Func.readCounters(PGOReader.get(), AllZeros)) + if (!Func.readCounters(PGOReader.get(), AllZeros, AllMinusOnes)) continue; if (AllZeros) { F.setEntryCount(ProfileCount(0, Function::PCT_Real)); @@ -1655,6 +1704,15 @@ static bool annotateAllFunctions( ColdFunctions.push_back(&F); continue; } + const unsigned MultiplyFactor = 3; + if (AllMinusOnes) { + uint64_t HotThreshold = PSI->getHotCountThreshold(); + if (HotThreshold) + F.setEntryCount( + ProfileCount(HotThreshold * MultiplyFactor, Function::PCT_Real)); + HotFunctions.push_back(&F); + continue; + } Func.populateCounters(); Func.setBranchWeights(); Func.annotateValueSites(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 2b7b859891dcd..43a1434ae2d37 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -38,6 +38,8 @@ #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" +#define INSTR_PROF_VALUE_PROF_MEMOP_API +#include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -89,17 +91,25 @@ static cl::opt cl::desc("Scale the memop size counts using the basic " " block count value")); +// FIXME: These are to be removed after switching to the new memop value +// profiling. // This option sets the range of precise profile memop sizes. extern cl::opt MemOPSizeRange; // This option sets the value that groups large memop sizes extern cl::opt MemOPSizeLarge; +extern cl::opt UseOldMemOpValueProf; + cl::opt MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), cl::Hidden, cl::desc("Size-specialize memcmp and bcmp calls")); +static cl::opt + MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), + cl::desc("Optimize the memop size <= this value")); + namespace { class PGOMemOPSizeOptLegacyPass : public FunctionPass { public: @@ -269,6 +279,8 @@ class MemOPSizeOpt : public InstVisitor { TargetLibraryInfo &TLI; bool Changed; std::vector WorkList; + // FIXME: These are to be removed after switching to the new memop value + // profiling. // Start of the precise range. int64_t PreciseRangeStart; // Last value of the precise range. int64_t PreciseRangeLast; std::unique_ptr ValueDataArray; bool perform(MemOp MO); + // FIXME: This is to be removed after switching to the new memop value + // profiling. // This kind shows which group the value falls in. For PreciseValue, we have // the profile count for that value. LargeGroup groups the values that are in // range [LargeValue, +inf). NonLargeGroup groups the rest of values. @@ -365,8 +379,11 @@ bool MemOPSizeOpt::perform(MemOp MO) { if (MemOPScaleCount) C = getScaledCount(C, ActualCount, SavedTotalCount); - // Only care precise value here. - if (getMemOPSizeKind(V) != PreciseValue) + if (UseOldMemOpValueProf) { + // Only care about precise values here. + if (getMemOPSizeKind(V) != PreciseValue) + continue; + } else if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) continue; // ValueCounts are sorted on the count.
Break at the first un-profitable diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8ce12c514f0bc..b8d9b3061f07e 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -478,7 +478,7 @@ void ThreadSanitizer::chooseInstructionsToInstrument( } } - if (isa(GetUnderlyingObject(Addr, DL)) && + if (isa(getUnderlyingObject(Addr)) && !PointerMayBeCaptured(Addr, true, true)) { // The variable is addressable but not captured, so it cannot be // referenced from a different thread and participate in a data race diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 46bc586fe688a..39ba2361a17a7 100644 --- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -107,7 +107,7 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, } else if (const StoreInst *SI = dyn_cast(Inst)) { // Special-case stores, because we don't care about the stored value, just // the store address. - const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand(), DL); + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); // If we can't tell what the underlying object was, assume there is a // dependence. return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp index c6138edba95ad..d528c65a90074 100644 --- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp +++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp @@ -162,8 +162,8 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B, bool ProvenanceAnalysis::related(const Value *A, const Value *B, const DataLayout &DL) { - A = GetUnderlyingObjCPtrCached(A, DL, UnderlyingObjCPtrCache); - B = GetUnderlyingObjCPtrCached(B, DL, UnderlyingObjCPtrCache); + A = GetUnderlyingObjCPtrCached(A, UnderlyingObjCPtrCache); + B = GetUnderlyingObjCPtrCached(B, UnderlyingObjCPtrCache); // Quick check. if (A == B) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 48968166c605f..397d62a5d21d3 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -973,11 +973,19 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F)); - if (!Changed) - return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserve(); - PA.preserve(); - PA.preserve(); + if (!Changed) { + PA = PreservedAnalyses::all(); + } else { + PA.preserve(); + PA.preserve(); + PA.preserve(); + } + + // Keeping LVI alive is expensive, both because it uses a lot of memory, and + // because invalidating values in LVI is expensive. While CVP does preserve + // LVI, we know that passes after JumpThreading+CVP will not need the result + // of this analysis, so we forcefully discard it early. 
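// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// The preserve-then-abandon pattern the CorrelatedValuePropagation comment
// above describes, under the usual new-pass-manager API: a pass can mark an
// analysis preserved for correctness yet still discard it, because keeping it
// alive costs more than recomputing it on demand would.
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/PassManager.h"

llvm::PreservedAnalyses finishCVP(bool Changed) {
  llvm::PreservedAnalyses PA = Changed ? llvm::PreservedAnalyses::none()
                                       : llvm::PreservedAnalyses::all();
  // Correct either way, but cheaper: drop LVI rather than keep invalidating
  // its cached values as later passes mutate the IR.
  PA.abandon<llvm::LazyValueAnalysis>();
  return PA;
}
// ---------------------------------------------------------------------------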
+ PA.abandon(); return PA; } diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 258fd5b9454f0..048b67a9f3b79 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -415,8 +415,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval/inalloca argument). If so, then it clearly // overwrites any other store to the same object. - const Value *UO1 = GetUnderlyingObject(P1, DL), - *UO2 = GetUnderlyingObject(P2, DL); + const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. @@ -739,7 +738,6 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryLocation Loc = MemoryLocation(F->getOperand(0)); SmallVector Blocks; Blocks.push_back(F->getParent()); - const DataLayout &DL = F->getModule()->getDataLayout(); while (!Blocks.empty()) { BasicBlock *BB = Blocks.pop_back_val(); @@ -755,7 +753,7 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, break; Value *DepPointer = - GetUnderlyingObject(getStoredPointerOperand(Dependency), DL); + getUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) @@ -795,7 +793,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, const DataLayout &DL, AliasAnalysis *AA, const TargetLibraryInfo *TLI, const Function *F) { - const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); + const Value *UnderlyingPointer = getUnderlyingObject(LoadedLoc.Ptr); // A constant can't be in the dead pointer set. if (isa(UnderlyingPointer)) @@ -861,7 +859,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector Pointers; - GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); + getUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers); // Stores to stack values are valid candidates for removal. bool AllDead = true; @@ -1134,7 +1132,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, Constant *StoredConstant = dyn_cast(SI->getValueOperand()); if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) { Instruction *UnderlyingPointer = - dyn_cast(GetUnderlyingObject(SI->getPointerOperand(), DL)); + dyn_cast(getUnderlyingObject(SI->getPointerOperand())); if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA, DL, DT)) { @@ -1289,7 +1287,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // to it is dead along the unwind edge. Otherwise, we need to preserve // the store. if (LastThrowing && DepWrite->comesBefore(LastThrowing)) { - const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL); + const Value *Underlying = getUnderlyingObject(DepLoc.Ptr); bool IsStoreDeadOnUnwind = isa(Underlying); if (!IsStoreDeadOnUnwind) { // We're looking for a call to an allocation function @@ -1715,7 +1713,7 @@ struct DSEState { // object can be considered terminated. 
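// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// The ValueTracking renaming applied throughout these hunks: the functions
// are now lower-camel-case and no longer take a DataLayout argument, which
// every caller previously had to thread through even though it went unused.
#include "llvm/Analysis/ValueTracking.h"

// Old: const Value *UO = GetUnderlyingObject(Ptr, DL);
// New:
const llvm::Value *underlyingObjectOf(const llvm::Value *Ptr) {
  return llvm::getUnderlyingObject(Ptr); // Strips GEPs, casts, etc.
}
// ---------------------------------------------------------------------------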
if (MaybeTermLoc->second) { DataLayout DL = MaybeTerm->getParent()->getModule()->getDataLayout(); - DefLoc = MemoryLocation(GetUnderlyingObject(DefLoc.Ptr, DL)); + DefLoc = MemoryLocation(getUnderlyingObject(DefLoc.Ptr)); } return AA.isMustAlias(MaybeTermLoc->first, DefLoc); } @@ -2030,7 +2028,6 @@ struct DSEState { /// Eliminate writes to objects that are not visible in the caller and are not /// accessed before returning from the function. bool eliminateDeadWritesAtEndOfFunction() { - const DataLayout &DL = F.getParent()->getDataLayout(); bool MadeChange = false; LLVM_DEBUG( dbgs() @@ -2047,7 +2044,7 @@ struct DSEState { Instruction *DefI = Def->getMemoryInst(); // See through pointer-to-pointer bitcasts SmallVector Pointers; - GetUnderlyingObjects(getLocForWriteEx(DefI)->Ptr, Pointers, DL); + getUnderlyingObjects(getLocForWriteEx(DefI)->Ptr, Pointers); LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end " "of the function\n"); @@ -2130,7 +2127,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, } MemoryLocation SILoc = *MaybeSILoc; assert(SILoc.Ptr && "SILoc should not be null"); - const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL); + const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr); // Check if the store is a no-op. if (isRemovable(SI) && State.storeIsNoop(KillingDef, SILoc, SILocUnd)) { @@ -2231,7 +2228,7 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, MemoryLocation NILoc = *State.getLocForWriteEx(NI); if (State.isMemTerminatorInst(SI)) { - const Value *NIUnd = GetUnderlyingObject(NILoc.Ptr, DL); + const Value *NIUnd = getUnderlyingObject(NILoc.Ptr); if (!SILocUnd || SILocUnd != NIUnd) continue; LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 0b416cc4afb86..1d82664ed4076 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -98,21 +98,30 @@ STATISTIC(NumGVNSimpl, "Number of instructions simplified"); STATISTIC(NumGVNEqProp, "Number of equalities propagated"); STATISTIC(NumPRELoad, "Number of loads PRE'd"); +STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax, + "Number of blocks speculated as available in " + "IsValueFullyAvailableInBlock(), max"); +STATISTIC(MaxBBSpeculationCutoffReachedTimes, + "Number of times we reached gvn-max-block-speculations cut-off " + "preventing further exploration"); + static cl::opt GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden); static cl::opt GVNEnableLoadPRE("enable-load-pre", cl::init(true)); static cl::opt GVNEnableLoadInLoopPRE("enable-load-in-loop-pre", cl::init(true)); static cl::opt GVNEnableMemDep("enable-gvn-memdep", cl::init(true)); -// Maximum allowed recursion depth. -static cl::opt -MaxRecurseDepth("gvn-max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, - cl::desc("Max recurse depth in GVN (default = 1000)")); - static cl::opt MaxNumDeps( "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore, cl::desc("Max number of dependences to attempt Load PRE (default = 100)")); +// This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat.
+static cl::opt MaxBBSpeculations( + "gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore, + cl::desc("Max number of blocks we're willing to speculate on (and recurse " + "into) when deducing if a value is fully available or not in GVN " + "(default = 600)")); + struct llvm::GVN::Expression { uint32_t opcode; bool commutative = false; @@ -669,15 +678,14 @@ LLVM_DUMP_METHOD void GVN::dump(DenseMap& d) const { enum class AvaliabilityState : char { /// We know the block *is not* fully available. This is a fixpoint. - Unavaliable = 0, + Unavailable = 0, /// We know the block *is* fully available. This is a fixpoint. - Avaliable = 1, + Available = 1, /// We do not know whether the block is fully available or not, /// but we are currently speculating that it will be. - SpeculativelyAvaliable = 2, - /// We are speculating for this block and have used that - /// to speculate for other blocks. - SpeculativelyAvaliableAndUsedForSpeculation = 3, + /// If it would have turned out that the block was, in fact, not fully + /// available, this would have been cleaned up into an Unavailable. + SpeculativelyAvailable = 2, }; /// Return true if we can prove that the value @@ -688,80 +696,118 @@ enum class AvaliabilityState : char { /// 1) we know the block *is* fully available. /// 2) we do not know whether the block is fully available or not, but we are /// currently speculating that it will be. -/// 3) we are speculating for this block and have used that to speculate for -/// other blocks. static bool IsValueFullyAvailableInBlock( BasicBlock *BB, - DenseMap &FullyAvailableBlocks, - uint32_t RecurseDepth) { - if (RecurseDepth > MaxRecurseDepth) - return false; - - // Optimistically assume that the block is speculatively available and check - // to see if we already know about this block in one lookup. - std::pair::iterator, bool> IV = - FullyAvailableBlocks.insert( - std::make_pair(BB, AvaliabilityState::SpeculativelyAvaliable)); - - // If the entry already existed for this block, return the precomputed value. - if (!IV.second) { - // If this is a speculative "available" value, mark it as being used for - // speculation of other blocks. - if (IV.first->second == AvaliabilityState::SpeculativelyAvaliable) - IV.first->second = - AvaliabilityState::SpeculativelyAvaliableAndUsedForSpeculation; - return IV.first->second != AvaliabilityState::Unavaliable; - } + DenseMap &FullyAvailableBlocks) { + SmallVector Worklist; + Optional UnavailableBB; - // Otherwise, see if it is fully available in all predecessors. - pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + // The number of times we didn't find an entry for a block in a map and + // optimistically inserted an entry marking the block as speculatively available. + unsigned NumNewNewSpeculativelyAvailableBBs = 0; - // If this block has no predecessors, it isn't live-in here. - if (PI == PE) - goto SpeculationFailure; +#ifndef NDEBUG + SmallSet NewSpeculativelyAvailableBBs; + SmallVector AvailableBBs; +#endif - for (; PI != PE; ++PI) - // If the value isn't fully available in one of our predecessors, then it - // isn't fully available in this block either. Undo our previous - // optimistic assumption and bail out. - if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1)) - goto SpeculationFailure; + Worklist.emplace_back(BB); + while (!Worklist.empty()) { + BasicBlock *CurrBB = Worklist.pop_back_val(); // LIFO - depth-first!
+ // Optimistically assume that the block is Speculatively Available and check + // to see if we already know about this block in one lookup. + std::pair::iterator, bool> IV = + FullyAvailableBlocks.try_emplace( + CurrBB, AvaliabilityState::SpeculativelyAvailable); + AvaliabilityState &State = IV.first->second; + + // Did the entry already exist for this block? + if (!IV.second) { + if (State == AvaliabilityState::Unavailable) { + UnavailableBB = CurrBB; + break; // Backpropagate unavailability info. + } - return true; +#ifndef NDEBUG + AvailableBBs.emplace_back(CurrBB); +#endif + continue; // Don't recurse further, but continue processing worklist. + } -// If we get here, we found out that this is not, after -// all, a fully-available block. We have a problem if we speculated on this and -// used the speculation to mark other blocks as available. -SpeculationFailure: - AvaliabilityState &BBVal = FullyAvailableBlocks[BB]; + // No entry found for block. + ++NumNewNewSpeculativelyAvailableBBs; + bool OutOfBudget = NumNewNewSpeculativelyAvailableBBs > MaxBBSpeculations; + + // If we have exhausted our budget, mark this block as unavailable. + // Also, if this block has no predecessors, the value isn't live-in here. + if (OutOfBudget || pred_empty(CurrBB)) { + MaxBBSpeculationCutoffReachedTimes += (int)OutOfBudget; + State = AvaliabilityState::Unavailable; + UnavailableBB = CurrBB; + break; // Backpropagate unavailability info. + } - // If we didn't speculate on this, just return with it set to unavaliable. - if (BBVal == AvaliabilityState::SpeculativelyAvaliable) { - BBVal = AvaliabilityState::Unavaliable; - return false; + // Tentatively consider this block as speculatively available. +#ifndef NDEBUG + NewSpeculativelyAvailableBBs.insert(CurrBB); +#endif + // And further recurse into block's predecessors, in depth-first order! + Worklist.append(pred_begin(CurrBB), pred_end(CurrBB)); } - // If we did speculate on this value, we could have blocks set to - // speculatively avaliable that are incorrect. Walk the (transitive) - // successors of this block and mark them as unavaliable instead. - SmallVector BBWorklist; - BBWorklist.push_back(BB); - - do { - BasicBlock *Entry = BBWorklist.pop_back_val(); - // Note that this sets blocks to unavailable if they happen to not - // already be in FullyAvailableBlocks. This is safe. - AvaliabilityState &EntryVal = FullyAvailableBlocks[Entry]; - if (EntryVal == AvaliabilityState::Unavaliable) - continue; // Already unavailable. - - // Mark as unavailable. - EntryVal = AvaliabilityState::Unavaliable; +#if LLVM_ENABLE_STATS + IsValueFullyAvailableInBlockNumSpeculationsMax.updateMax( + NumNewNewSpeculativelyAvailableBBs); +#endif - BBWorklist.append(succ_begin(Entry), succ_end(Entry)); - } while (!BBWorklist.empty()); + // If the block isn't marked as fixpoint yet + // (the Unavailable and Available states are fixpoints) + auto MarkAsFixpointAndEnqueueSuccessors = + [&](BasicBlock *BB, AvaliabilityState FixpointState) { + auto It = FullyAvailableBlocks.find(BB); + if (It == FullyAvailableBlocks.end()) + return; // Never queried this block, leave as-is. + switch (AvaliabilityState &State = It->second) { + case AvaliabilityState::Unavailable: + case AvaliabilityState::Available: + return; // Don't backpropagate further, continue processing worklist. + case AvaliabilityState::SpeculativelyAvailable: // Fix it!
+ State = FixpointState; +#ifndef NDEBUG + assert(NewSpeculativelyAvailableBBs.erase(BB) && + "Found a speculatively available successor leftover?"); +#endif + // Queue successors for further processing. + Worklist.append(succ_begin(BB), succ_end(BB)); + return; + } + }; + + if (UnavailableBB) { + // Okay, we have encountered an unavailable block. + // Mark speculatively available blocks reachable from UnavailableBB as + // unavailable as well. Paths are terminated when they reach blocks not in + // FullyAvailableBlocks or they are not marked as speculatively available. + Worklist.clear(); + Worklist.append(succ_begin(*UnavailableBB), succ_end(*UnavailableBB)); + while (!Worklist.empty()) + MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(), + AvaliabilityState::Unavailable); + } + +#ifndef NDEBUG + Worklist.clear(); + for (BasicBlock *AvailableBB : AvailableBBs) + Worklist.append(succ_begin(AvailableBB), succ_end(AvailableBB)); + while (!Worklist.empty()) + MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(), + AvaliabilityState::Available); + + assert(NewSpeculativelyAvailableBBs.empty() && + "Must have fixed all the new speculatively available blocks."); +#endif - return false; + return !UnavailableBB; } /// Given a set of loads specified by ValuesPerBlock, @@ -1126,9 +1172,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, MapVector PredLoads; DenseMap FullyAvailableBlocks; for (const AvailableValueInBlock &AV : ValuesPerBlock) - FullyAvailableBlocks[AV.BB] = AvaliabilityState::Avaliable; + FullyAvailableBlocks[AV.BB] = AvaliabilityState::Available; for (BasicBlock *UnavailableBB : UnavailableBlocks) - FullyAvailableBlocks[UnavailableBB] = AvaliabilityState::Unavaliable; + FullyAvailableBlocks[UnavailableBB] = AvaliabilityState::Unavailable; SmallVector CriticalEdgePred; for (BasicBlock *Pred : predecessors(LoadBB)) { @@ -1141,7 +1187,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { continue; } diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index dfb4b7e038ba5..5a34ad18158b4 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -158,8 +158,7 @@ class LockstepReverseIterator { void restrictToBlocks(SmallSetVector &Blocks) { for (auto II = Insts.begin(); II != Insts.end();) { - if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == - Blocks.end()) { + if (!llvm::is_contained(Blocks, (*II)->getParent())) { ActiveBlocks.remove((*II)->getParent()); II = Insts.erase(II); } else { @@ -277,8 +276,7 @@ class ModelledPHI { auto VI = Values.begin(); while (BI != Blocks.end()) { assert(VI != Values.end()); - if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == - NewBlocks.end()) { + if (!llvm::is_contained(NewBlocks, *BI)) { BI = Blocks.erase(BI); VI = Values.erase(VI); } else { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9d0500419a7f5..e6d261fa9aff5 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -674,12 +674,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl( return !Result.empty(); } - // Handle Cast instructions. Only see through Cast when the source operand is - // PHI or Cmp to save the compilation time. 
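// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch above.]
// The overall shape of the rewritten availability walk: bounded recursion is
// replaced by an explicit depth-first worklist over predecessors with one
// global budget on new speculations. Simplified; the real code above also
// backpropagates the Unavailable/Available fixpoints through blocks that were
// speculated on, which this sketch omits.
#include <unordered_map>
#include <vector>

enum class State : char { Unavailable, Available, SpeculativelyAvailable };

template <typename Node, typename PredsFn>
bool isFullyAvailable(Node *Root, std::unordered_map<Node *, State> &Known,
                      PredsFn Preds, unsigned Budget) {
  std::vector<Node *> Worklist{Root};
  unsigned NewSpeculations = 0;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back(); // LIFO, i.e. depth-first.
    auto [It, Inserted] = Known.try_emplace(N, State::SpeculativelyAvailable);
    if (!Inserted) {
      if (It->second == State::Unavailable)
        return false;
      continue; // Available, or already being speculated on.
    }
    // New speculation: charge the budget. A block with no predecessors
    // cannot have the value live-in, so it is Unavailable by definition.
    if (++NewSpeculations > Budget || Preds(N).empty()) {
      It->second = State::Unavailable;
      return false;
    }
    for (Node *P : Preds(N))
      Worklist.push_back(P);
  }
  return true;
}
// ---------------------------------------------------------------------------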
+ // Handle Cast instructions. if (CastInst *CI = dyn_cast(I)) { Value *Source = CI->getOperand(0); - if (!isa(Source) && !isa(Source)) - return false; ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, RecursionSet, CxtI); if (Result.empty()) @@ -692,6 +689,18 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl( return true; } + if (FreezeInst *FI = dyn_cast(I)) { + Value *Source = FI->getOperand(0); + ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference, + RecursionSet, CxtI); + + erase_if(Result, [](auto &Pair) { + return !isGuaranteedNotToBeUndefOrPoison(Pair.first); + }); + + return !Result.empty(); + } + // Handle some boolean conditions. if (I->getType()->getPrimitiveSizeInBits() == 1) { assert(Preference == WantInteger && "One-bit non-integer type?"); @@ -1040,9 +1049,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { } } - // If the terminator is branching on an undef, we can pick any of the - // successors to branch to. Let GetBestDestForJumpOnUndef decide. - if (isa(Condition)) { + // If the terminator is branching on an undef or freeze undef, we can pick any + // of the successors to branch to. Let GetBestDestForJumpOnUndef decide. + auto *FI = dyn_cast(Condition); + if (isa(Condition) || + (FI && isa(FI->getOperand(0)) && FI->hasOneUse())) { unsigned BestSucc = GetBestDestForJumpOnUndef(BB); std::vector Updates; @@ -1061,6 +1072,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); BBTerm->eraseFromParent(); DTU->applyUpdatesPermissive(Updates); + if (FI) + FI->eraseFromParent(); return true; } @@ -1147,6 +1160,11 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. Value *SimplifyValue = CondInst; + + if (auto *FI = dyn_cast(SimplifyValue)) + // Look into freeze's operand + SimplifyValue = FI->getOperand(0); + if (CmpInst *CondCmp = dyn_cast(SimplifyValue)) if (isa(CondCmp->getOperand(1))) SimplifyValue = CondCmp->getOperand(0); @@ -1859,6 +1877,14 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return true; } + // If any of predecessors end with an indirect goto, we can't change its + // destination. Same for CallBr. + if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { + return isa(Pred->getTerminator()) || + isa(Pred->getTerminator()); + })) + return false; + // Try to duplicate BB into PredBB. return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 1a22edaf87266..4bf39ba8f151c 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1909,7 +1909,7 @@ bool llvm::promoteLoopAccessesToScalars( // we have to prove that the store is dead along the unwind edge. We do // this by proving that the caller can't have a reference to the object // after return and thus can't possibly load from the object. 
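// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative worked example, not part of the patch
// above.]
// Why the JumpThreading hunks above may treat a branch on freeze(undef) like
// a branch on undef: freeze pins an undef to some arbitrary-but-consistent
// value, so the compiler is free to choose that value, and with it the edge.
// Conceptually, in IR:
//
//   %c = freeze i1 undef         ; may be folded to either true or false
//   br i1 %c, label %a, label %b ; so either destination is a valid choice
//
// The hasOneUse() check matters because the freeze is erased together with
// the rewritten branch, which is only safe if the branch was its sole user.
// Likewise, ComputeValueKnownInPredecessorsImpl may look through a freeze
// only for predecessor values proven neither undef nor poison, hence the
// erase_if over isGuaranteedNotToBeUndefOrPoison in the hunk above.
// ---------------------------------------------------------------------------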
- Value *Object = GetUnderlyingObject(SomePtr, MDL); + Value *Object = getUnderlyingObject(SomePtr); if (!isKnownNonEscaping(Object, TLI)) return false; // Subtlety: Alloca's aren't visible to callers, but *are* potentially @@ -2041,7 +2041,7 @@ bool llvm::promoteLoopAccessesToScalars( if (IsKnownThreadLocalObject) SafeToInsertStore = true; else { - Value *Object = GetUnderlyingObject(SomePtr, MDL); + Value *Object = getUnderlyingObject(SomePtr); SafeToInsertStore = (isAllocLikeFn(Object, TLI) || isa(Object)) && !PointerMayBeCaptured(Object, true, true); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 5573af834c597..90356ed0f2a75 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -66,7 +66,7 @@ #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeMoverUtils.h" -#include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index fc980cf3689fb..52993c9dcb091 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -564,12 +564,12 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { break; case LegalStoreKind::Memset: { // Find the base pointer. - Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); StoreRefsForMemset[Ptr].push_back(SI); } break; case LegalStoreKind::MemsetPattern: { // Find the base pointer. - Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); StoreRefsForMemsetPattern[Ptr].push_back(SI); } break; case LegalStoreKind::Memcpy: diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 7787c0bccd4ce..3cfe1b595571e 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1521,8 +1521,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() { InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); // The outer loop header might or might not branch to the outer latch. // We are guaranteed to branch to the inner loop preheader. 
- if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI), - OuterLoopLatch) != succ_end(OuterLoopHeaderBI)) + if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates, /*MustUpdateOnce=*/false); updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 98889a9df1163..91de5715a6ae3 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -39,9 +39,6 @@ PassManager(*Pass, L)) continue; - if (DebugLogging) - dbgs() << "Running pass: " << Pass->name() << " on " << L; - PreservedAnalyses PassPA; { TimeTraceScope TimeScope(Pass->name(), L.getName()); diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 031e5b9c1d2c9..8ab3c2ca118d4 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -366,15 +366,20 @@ class ConstantTerminatorFoldingImpl { unsigned DummyIdx = 1; for (BasicBlock *BB : DeadExitBlocks) { - SmallVector DeadPhis; + // Eliminate all Phis and LandingPads from dead exits. + // TODO: Consider removing all instructions in this dead block. + SmallVector DeadInstructions; for (auto &PN : BB->phis()) - DeadPhis.push_back(&PN); + DeadInstructions.push_back(&PN); - // Eliminate all Phis from dead exits. - for (Instruction *PN : DeadPhis) { - PN->replaceAllUsesWith(UndefValue::get(PN->getType())); - PN->eraseFromParent(); + if (auto *LandingPad = dyn_cast(BB->getFirstNonPHI())) + DeadInstructions.emplace_back(LandingPad); + + for (Instruction *I : DeadInstructions) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); } + assert(DummyIdx != 0 && "Too many dead exits!"); DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB); DTUpdates.push_back({DominatorTree::Insert, Preheader, BB}); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index cf02ef1e83f3f..c3e46c1fadef3 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5514,8 +5514,8 @@ void LSRInstance::ImplementSolution( // we can remove them after we are done working. 
SmallVector DeadInsts; - SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), - "lsr"); + SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", + false); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif @@ -5780,7 +5780,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Rewriter(SE, DL, "lsr"); + SCEVExpander Rewriter(SE, DL, "lsr", false); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 285cba6ee2054..bd62419323065 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 87f40bb7ba852..2b610392dcfd5 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -56,6 +56,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -115,10 +116,6 @@ static cl::opt UnrollFullMaxCount( cl::desc( "Set the max unroll count for full unrolling, for testing purposes")); -static cl::opt UnrollPeelCount( - "unroll-peel-count", cl::Hidden, - cl::desc("Set the unroll peeling count, for testing purposes")); - static cl::opt UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " @@ -149,15 +146,6 @@ static cl::opt FlatLoopTripCountThreshold( "threshold, the loop is considered as flat and will be less " "aggressively unrolled.")); -static cl::opt - UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, - cl::desc("Allows loops to be peeled when the dynamic " - "trip count is known to be low.")); - -static cl::opt UnrollAllowLoopNestsPeeling( - "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden, - cl::desc("Allows loop nests to be peeled.")); - static cl::opt UnrollUnrollRemainder( "unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled.")); @@ -275,39 +263,6 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( return UP; } -TargetTransformInfo::PeelingPreferences -llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - Optional UserAllowPeeling, - Optional UserAllowProfileBasedPeeling) { - TargetTransformInfo::PeelingPreferences PP; - - // Default values - PP.PeelCount = 0; - PP.AllowPeeling = true; - PP.AllowLoopNestsPeeling = false; - PP.PeelProfiledIterations = true; - - // Get Target Specifc Values - TTI.getPeelingPreferences(L, SE, PP); - - // User Specified Values using cl::opt - if (UnrollPeelCount.getNumOccurrences() > 0) - PP.PeelCount = UnrollPeelCount; - if 
(UnrollAllowPeeling.getNumOccurrences() > 0) - PP.AllowPeeling = UnrollAllowPeeling; - if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0) - PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling; - - // User Specifed values provided by argument - if (UserAllowPeeling.hasValue()) - PP.AllowPeeling = *UserAllowPeeling; - if (UserAllowProfileBasedPeeling.hasValue()) - PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; - - return PP; -} - namespace { /// A struct to densely store the state of an instruction after unrolling at @@ -881,7 +836,7 @@ bool llvm::computeUnrollCount( } // 4th priority is loop peeling. - computePeelCount(L, LoopSize, UP, PP, TripCount, SE); + computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold); if (PP.PeelCount) { UP.Runtime = false; UP.Count = 1; @@ -1087,7 +1042,7 @@ static LoopUnrollResult tryToUnrollLoop( ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, ProvidedFullUnrollMaxCount); TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences( - L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling); + L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index a9332356b3713..053d96430b6b3 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1562,7 +1562,7 @@ class LowerMatrixIntrinsics { if (Value *Ptr = getPointerOperand(V)) return getUnderlyingObjectThroughLoads(Ptr); else if (V->getType()->isPointerTy()) - return GetUnderlyingObject(V, DL); + return getUnderlyingObject(V); return V; } diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index dc2ad14ae61e5..c344c6c684776 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -2025,8 +2025,8 @@ chainToBasePointerCost(SmallVectorImpl &Chain, Type *SrcTy = CI->getOperand(0)->getType(); Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, - TargetTransformInfo::TCK_SizeAndLatency, - CI); + TTI::getCastContextHint(CI), + TargetTransformInfo::TCK_SizeAndLatency, CI); } else if (GetElementPtrInst *GEP = dyn_cast(Instr)) { // Cost of the address calculation diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 32dc14e5ec195..9c9f483ab103d 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -276,7 +276,7 @@ class SCCPSolver : public InstVisitor { // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. 
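The RewriteStatepointsForGC hunk above adapts to getCastInstrCost gaining a CastContextHint parameter between the source type and the cost kind. A hedged sketch of the new call shape (the wrapper function and its int return type are illustrative assumptions, not part of the patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// TTI::getCastContextHint(CI) derives the context from the instruction when
// one is available; TTI::CastContextHint::None is the fallback used
// elsewhere in this patch when no instruction context exists.
static int castCost(const TargetTransformInfo &TTI, const CastInst *CI) {
  return TTI.getCastInstrCost(CI->getOpcode(), CI->getType(),
                              CI->getOperand(0)->getType(),
                              TTI::getCastContextHint(CI),
                              TargetTransformInfo::TCK_SizeAndLatency, CI);
}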
- bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const; std::vector getStructLatticeValueFor(Value *V) const { std::vector StructValues; @@ -649,17 +649,30 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, Succs[0] = true; return; } - ValueLatticeElement SCValue = getValueState(SI->getCondition()); - ConstantInt *CI = getConstantInt(SCValue); + const ValueLatticeElement &SCValue = getValueState(SI->getCondition()); + if (ConstantInt *CI = getConstantInt(SCValue)) { + Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true; + return; + } - if (!CI) { // Overdefined or unknown condition? - // All destinations are executable! - if (!SCValue.isUnknownOrUndef()) - Succs.assign(TI.getNumSuccessors(), true); + // TODO: Switch on undef is UB. Stop passing false once the rest of LLVM + // is ready. + if (SCValue.isConstantRange(/*UndefAllowed=*/false)) { + const ConstantRange &Range = SCValue.getConstantRange(); + for (const auto &Case : SI->cases()) { + const APInt &CaseValue = Case.getCaseValue()->getValue(); + if (Range.contains(CaseValue)) + Succs[Case.getSuccessorIndex()] = true; + } + + // TODO: Determine whether default case is reachable. + Succs[SI->case_default()->getSuccessorIndex()] = true; return; } - Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true; + // Overdefined or unknown condition? All destinations are executable! + if (!SCValue.isUnknownOrUndef()) + Succs.assign(TI.getNumSuccessors(), true); return; } @@ -705,7 +718,7 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. -bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // Check if we've called markEdgeExecutable on the edge yet. (We could // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) @@ -1807,39 +1820,68 @@ static void findReturnsToZap(Function &F, } } -// Update the condition for terminators that are branching on indeterminate -// values, forcing them to use a specific edge. -static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { - BasicBlock *Dest = nullptr; - Constant *C = nullptr; - if (SwitchInst *SI = dyn_cast(I)) { - if (!isa(SI->getCondition())) { - // Indeterminate switch; use first case value. - Dest = SI->case_begin()->getCaseSuccessor(); - C = SI->case_begin()->getCaseValue(); - } - } else if (BranchInst *BI = dyn_cast(I)) { - if (!isa(BI->getCondition())) { - // Indeterminate branch; use false. - Dest = BI->getSuccessor(1); - C = ConstantInt::getFalse(BI->getContext()); +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU) { + SmallPtrSet FeasibleSuccessors; + bool HasNonFeasibleEdges = false; + for (BasicBlock *Succ : successors(BB)) { + if (Solver.isEdgeFeasible(BB, Succ)) + FeasibleSuccessors.insert(Succ); + else + HasNonFeasibleEdges = true; + } + + // All edges feasible, nothing to do. + if (!HasNonFeasibleEdges) + return false; + + // SCCP can only determine non-feasible edges for br, switch and indirectbr. + Instruction *TI = BB->getTerminator(); + assert((isa(TI) || isa(TI) || + isa(TI)) && + "Terminator must be a br, switch or indirectbr"); + + if (FeasibleSuccessors.size() == 1) { + // Replace with an unconditional branch to the only feasible successor. 
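The new switch handling in getFeasibleSuccessors above is worth restating: when SCCP knows only a constant range for the condition, exactly the cases whose values lie in that range stay feasible, plus (conservatively, per the TODO) the default case. A sketch with a hypothetical helper:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Succs is indexed by successor number, as in getFeasibleSuccessors.
static void markFeasibleCases(SwitchInst &SI, const ConstantRange &Range,
                              SmallVectorImpl<bool> &Succs) {
  for (const auto &Case : SI.cases())
    if (Range.contains(Case.getCaseValue()->getValue()))
      Succs[Case.getSuccessorIndex()] = true;
  // Reachability of the default case is not analyzed yet; keep it feasible.
  Succs[SI.case_default()->getSuccessorIndex()] = true;
}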
+ BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); + SmallVector Updates; + bool HaveSeenOnlyFeasibleSuccessor = false; + for (BasicBlock *Succ : successors(BB)) { + if (Succ == OnlyFeasibleSuccessor && !HaveSeenOnlyFeasibleSuccessor) { + // Don't remove the edge to the only feasible successor the first time + // we see it. We still do need to remove any multi-edges to it though. + HaveSeenOnlyFeasibleSuccessor = true; + continue; + } + + Succ->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, Succ}); } - } else if (IndirectBrInst *IBR = dyn_cast(I)) { - if (!isa(IBR->getAddress()->stripPointerCasts())) { - // Indeterminate indirectbr; use successor 0. - Dest = IBR->getSuccessor(0); - C = BlockAddress::get(IBR->getSuccessor(0)); + + BranchInst::Create(OnlyFeasibleSuccessor, BB); + TI->eraseFromParent(); + DTU.applyUpdatesPermissive(Updates); + } else if (FeasibleSuccessors.size() > 1) { + SwitchInstProfUpdateWrapper SI(*cast(TI)); + SmallVector Updates; + for (auto CI = SI->case_begin(); CI != SI->case_end();) { + if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) { + ++CI; + continue; + } + + BasicBlock *Succ = CI->getCaseSuccessor(); + Succ->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, Succ}); + SI.removeCase(CI); + // Don't increment CI, as we removed a case. } - } else { - llvm_unreachable("Unexpected terminator instruction"); - } - if (C) { - assert(Solver.isEdgeFeasible(I->getParent(), Dest) && - "Didn't find feasible edge?"); - (void)Dest; - I->setOperand(0, C); + DTU.applyUpdatesPermissive(Updates); + } else { + llvm_unreachable("Must have at least one feasible successor"); } + return true; } bool llvm::runIPSCCP( @@ -1911,15 +1953,35 @@ bool llvm::runIPSCCP( SmallVector BlocksToErase; - if (Solver.isBlockExecutable(&F.front())) - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) { - if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) { + if (Solver.isBlockExecutable(&F.front())) { + bool ReplacedPointerArg = false; + for (Argument &Arg : F.args()) { + if (!Arg.use_empty() && tryToReplaceWithConstant(Solver, &Arg)) { + ReplacedPointerArg |= Arg.getType()->isPointerTy(); ++IPNumArgsElimed; - continue; } } + // If we replaced an argument, the argmemonly and + // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove + // them from both the function and callsites. + if (ReplacedPointerArg) { + SmallVector AttributesToRemove = { + Attribute::ArgMemOnly, Attribute::InaccessibleMemOrArgMemOnly}; + for (auto Attr : AttributesToRemove) + F.removeFnAttr(Attr); + + for (User *U : F.users()) { + auto *CB = dyn_cast(U); + if (!CB || CB->getCalledFunction() != &F) + continue; + + for (auto Attr : AttributesToRemove) + CB->removeAttribute(AttributeList::FunctionIndex, Attr); + } + } + } + SmallPtrSet InsertedValues; for (BasicBlock &BB : F) { if (!Solver.isBlockExecutable(&BB)) { @@ -1952,45 +2014,11 @@ bool llvm::runIPSCCP( /*UseLLVMTrap=*/false, /*PreserveLCSSA=*/false, &DTU); - // Now that all instructions in the function are constant folded, - // use ConstantFoldTerminator to get rid of in-edges, record DT updates and - // delete dead BBs. - for (BasicBlock *DeadBB : BlocksToErase) { - // If there are any PHI nodes in this successor, drop entries for BB now. - for (Value::user_iterator UI = DeadBB->user_begin(), - UE = DeadBB->user_end(); - UI != UE;) { - // Grab the user and then increment the iterator early, as the user - // will be deleted. 
Step past all adjacent uses from the same user. - auto *I = dyn_cast(*UI); - do { ++UI; } while (UI != UE && *UI == I); - - // Ignore blockaddress users; BasicBlock's dtor will handle them. - if (!I) continue; - - // If we have forced an edge for an indeterminate value, then force the - // terminator to fold to that edge. - forceIndeterminateEdge(I, Solver); - BasicBlock *InstBB = I->getParent(); - bool Folded = ConstantFoldTerminator(InstBB, - /*DeleteDeadConditions=*/false, - /*TLI=*/nullptr, &DTU); - assert(Folded && - "Expect TermInst on constantint or blockaddress to be folded"); - (void) Folded; - // If we folded the terminator to an unconditional branch to another - // dead block, replace it with Unreachable, to avoid trying to fold that - // branch again. - BranchInst *BI = cast(InstBB->getTerminator()); - if (BI && BI->isUnconditional() && - !Solver.isBlockExecutable(BI->getSuccessor(0))) { - InstBB->getTerminator()->eraseFromParent(); - new UnreachableInst(InstBB->getContext(), InstBB); - } - } - // Mark dead BB for deletion. + for (BasicBlock &BB : F) + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + + for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB); - } for (BasicBlock &BB : F) { for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 89f324deef9fd..a95a7aba0d38d 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -920,6 +920,9 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. void visitIntrinsicInst(IntrinsicInst &II) { + if (II.isDroppable()) + return; + if (!IsOffsetKnown) return PI.setAborted(&II); @@ -1825,7 +1828,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast(U->getUser())) { - if (!II->isLifetimeStartOrEnd()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { // Disable vector promotion when there are loads or stores of an FCA. @@ -2058,7 +2061,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast(U->getUser())) { - if (!II->isLifetimeStartOrEnd()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else { return false; @@ -2778,7 +2781,7 @@ class llvm::sroa::AllocaSliceRewriter Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); - + const bool CanContinue = [&]() { if (VecTy || IntTy) return true; @@ -3074,13 +3077,21 @@ class llvm::sroa::AllocaSliceRewriter } bool visitIntrinsicInst(IntrinsicInst &II) { - assert(II.isLifetimeStartOrEnd()); + assert((II.isLifetimeStartOrEnd() || II.isDroppable()) && + "Unexpected intrinsic!"); LLVM_DEBUG(dbgs() << " original: " << II << "\n"); - assert(II.getArgOperand(1) == OldPtr); // Record this instruction for deletion. Pass.DeadInsts.insert(&II); + if (II.isDroppable()) { + assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume"); + // TODO For now we forget assumed information, this can be improved. 
+ OldPtr->dropDroppableUsesIn(II); + return true; + } + + assert(II.getArgOperand(1) == OldPtr); // Lifetime intrinsics are only promotable if they cover the whole alloca. // Therefore, we drop lifetime intrinsics which don't cover the whole // alloca. diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index a0598447030c0..8bc35d5228df8 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -110,7 +110,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopDistributeLegacyPass(Registry); initializeLoopLoadEliminationPass(Registry); initializeLoopSimplifyCFGLegacyPassPass(Registry); - initializeLoopVersioningPassPass(Registry); + initializeLoopVersioningLegacyPassPass(Registry); initializeEntryExitInstrumenterPass(Registry); initializePostInlineEntryExitInstrumenterPass(Registry); } diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 6c6d6ca9cf656..ab1945a1aff11 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -93,6 +94,11 @@ static cl::opt<bool> UnswitchGuards( "simple-loop-unswitch-guards", cl::init(true), cl::Hidden, cl::desc("If enabled, simple loop unswitching will also consider " "llvm.experimental.guard intrinsics as unswitch candidates.")); +static cl::opt<bool> DropNonTrivialImplicitNullChecks( + "simple-loop-unswitch-drop-non-trivial-implicit-null-checks", + cl::init(false), cl::Hidden, + cl::desc("If enabled, drop make.implicit metadata in unswitched implicit " + "null checks to save time analyzing if we can keep it.")); /// Collect all of the loop invariant input values transitively used by the /// homogeneous instruction graph from a given root. @@ -2070,6 +2076,23 @@ static void unswitchNontrivialInvariants( DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU); } + // Drop metadata if we may break its semantics by moving this instruction into the + // split block. + if (TI.getMetadata(LLVMContext::MD_make_implicit)) { + if (DropNonTrivialImplicitNullChecks) + // Do not spend time trying to understand if we can keep it, just drop it + // to save compile time. + TI.setMetadata(LLVMContext::MD_make_implicit, nullptr); + else { + // It is only legal to preserve make.implicit metadata if we are + // guaranteed to reach the implicit null check after following this branch. + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(&L); + if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L)) + TI.setMetadata(LLVMContext::MD_make_implicit, nullptr); + } + } + // The stitching of the branched code back together depends on whether we're // doing full unswitching or not with the exception that we always want to // nuke the initial terminator placed in the split block. @@ -2844,7 +2867,6 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, ScalarEvolution *SE, MemorySSAUpdater *MSSAU) { assert(L.isRecursivelyLCSSAForm(DT, LI) && "Loops must be in LCSSA form before unswitching."); - bool Changed = false; // Must be in loop simplified form: we need a preheader and dedicated exits.
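To make the make.implicit handling above concrete, here is the preservation check in isolation, using the same calls as the hunk (only the wrapper function is invented):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Keep make.implicit only if the terminator is guaranteed to execute in the
// loop; otherwise drop the metadata rather than analyzing any further.
static void maybeDropMakeImplicit(Instruction &TI, const DominatorTree &DT,
                                  Loop &L) {
  ICFLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(&L);
  if (!SafetyInfo.isGuaranteedToExecute(TI, &DT, &L))
    TI.setMetadata(LLVMContext::MD_make_implicit, nullptr);
}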
if (!L.isLoopSimplifyForm()) @@ -2876,7 +2898,7 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, return true; // No other opportunities to unswitch. - return Changed; + return false; } PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index db5211df397a8..b0435bf6e4eac 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -63,8 +63,8 @@ static cl::opt UserForwardSwitchCond( cl::desc("Forward switch condition to phi ops (default = false)")); static cl::opt UserHoistCommonInsts( - "hoist-common-insts", cl::Hidden, cl::init(true), - cl::desc("hoist common instructions (default = true)")); + "hoist-common-insts", cl::Hidden, cl::init(false), + cl::desc("hoist common instructions (default = false)")); static cl::opt UserSinkCommonInsts( "sink-common-insts", cl::Hidden, cl::init(false), diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 7ff73fcdada79..a3d0c0421e949 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -69,7 +69,7 @@ RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) { default: return RK; case Attribute::NonNull: - RK.WasOn = GetUnderlyingObject(RK.WasOn, M->getDataLayout()); + RK.WasOn = getUnderlyingObject(RK.WasOn); return RK; case Attribute::Alignment: { Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) { @@ -145,7 +145,7 @@ struct AssumeBuilderState { if (!RK.WasOn) return true; if (RK.WasOn->getType()->isPointerTy()) { - Value *UnderlyingPtr = GetUnderlyingObject(RK.WasOn, M->getDataLayout()); + Value *UnderlyingPtr = getUnderlyingObject(RK.WasOn); if (isa(UnderlyingPtr) || isa(UnderlyingPtr)) return false; } diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 19f655c3a78fb..2ef17dc595fa8 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -35,11 +35,11 @@ add_llvm_component_library(LLVMTransformUtils LCSSA.cpp LibCallsShrinkWrap.cpp Local.cpp + LoopPeel.cpp LoopRotationUtils.cpp LoopSimplify.cpp LoopUnroll.cpp LoopUnrollAndJam.cpp - LoopUnrollPeel.cpp LoopUnrollRuntime.cpp LoopUtils.cpp LoopVersioning.cpp diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 788983c156903..957e4028bae77 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -147,6 +147,11 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, TypeMapper, Materializer)); } + // Everything else beyond this point deals with function instructions, + // so if we are dealing with a function declaration, we're done. + if (OldFunc->isDeclaration()) + return; + // When we remap instructions, we want to avoid duplicating inlined // DISubprograms, so record all subprograms we find as we duplicate // instructions and then freeze them in the MD map. 
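Several hunks in this patch (SROA above, mem2reg below) lean on the same pair of APIs: Instruction::isDroppable() identifies users such as llvm.assume whose information may be discarded, and dropDroppableUse()/dropDroppableUsesIn() do the discarding. A compact sketch of the pattern with an invented helper:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Use.h"
using namespace llvm;

// If the user of U merely "assumes" something about the value, erase that
// knowledge instead of refusing to transform; returns true if the use no
// longer blocks the transformation.
static bool tryDropBlockingUse(Use &U) {
  auto *UserInst = cast<Instruction>(U.getUser());
  if (!UserInst->isDroppable())
    return false;
  UserInst->dropDroppableUse(U);
  return true;
}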
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 5c9cb1245d01c..30726627bc829 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1037,7 +1037,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, SmallSetVector NAPtrArgs; for (const Value *V : PtrArgs) { SmallVector Objects; - GetUnderlyingObjects(V, Objects, DL, /* LI = */ nullptr); + getUnderlyingObjects(V, Objects, /* LI = */ nullptr); for (const Value *O : Objects) ObjSet.insert(O); diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp index b1a1c564d2171..630aadadbbce6 100644 --- a/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" @@ -77,12 +78,15 @@ static bool isExitBlock(BasicBlock *BB, /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, const DominatorTree &DT, const LoopInfo &LI, - ScalarEvolution *SE) { + ScalarEvolution *SE, IRBuilderBase &Builder, + SmallVectorImpl *PHIsToRemove) { SmallVector UsesToRewrite; - SmallSetVector PHIsToRemove; + SmallSetVector LocalPHIsToRemove; PredIteratorCache PredCache; bool Changed = false; + IRBuilderBase::InsertPointGuard InsertPtGuard(Builder); + // Cache the Loop ExitBlocks across this loop. We expect to get a lot of // instructions within the same loops, computing the exit blocks is // expensive, and we're not mutating the loop structure. @@ -151,9 +155,9 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, // If we already inserted something for this BB, don't reprocess it. if (SSAUpdate.HasValueForBlock(ExitBB)) continue; - - PHINode *PN = PHINode::Create(I->getType(), PredCache.size(ExitBB), - I->getName() + ".lcssa", &ExitBB->front()); + Builder.SetInsertPoint(&ExitBB->front()); + PHINode *PN = Builder.CreatePHI(I->getType(), PredCache.size(ExitBB), + I->getName() + ".lcssa"); // Get the debug location from the original instruction. PN->setDebugLoc(I->getDebugLoc()); // Add inputs from inside the loop for this PHI. @@ -253,22 +257,28 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, SmallVector NeedDbgValues; for (PHINode *PN : AddedPHIs) if (PN->use_empty()) - PHIsToRemove.insert(PN); + LocalPHIsToRemove.insert(PN); else NeedDbgValues.push_back(PN); insertDebugValuesForPHIs(InstBB, NeedDbgValues); Changed = true; } - // Remove PHI nodes that did not have any uses rewritten. We need to redo the - // use_empty() check here, because even if the PHI node wasn't used when added - // to PHIsToRemove, later added PHI nodes can be using it. This cleanup is - // not guaranteed to handle trees/cycles of PHI nodes that only are used by - // each other. Such situations has only been noticed when the input IR - // contains unreachable code, and leaving some extra redundant PHI nodes in - // such situations is considered a minor problem. - for (PHINode *PN : PHIsToRemove) - if (PN->use_empty()) - PN->eraseFromParent(); + + // Remove PHI nodes that did not have any uses rewritten or add them to + // PHIsToRemove, so the caller can remove them after some additional cleanup. 
+ // We need to redo the use_empty() check here, because even if the PHI node + // wasn't used when added to LocalPHIsToRemove, later added PHI nodes can be + // using it. This cleanup is not guaranteed to handle trees/cycles of PHI + // nodes that are only used by each other. Such situations have only been + // noticed when the input IR contains unreachable code, and leaving some extra + // redundant PHI nodes in such situations is considered a minor problem. + if (PHIsToRemove) { + PHIsToRemove->append(LocalPHIsToRemove.begin(), LocalPHIsToRemove.end()); + } else { + for (PHINode *PN : LocalPHIsToRemove) + if (PN->use_empty()) + PN->eraseFromParent(); + } return Changed; } @@ -369,7 +379,9 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI, Worklist.push_back(&I); } } - Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE); + + IRBuilder<> Builder(L.getHeader()->getContext()); + Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE, Builder); // If we modified the code, remove any caches about the loop from SCEV to // avoid dangling entries. diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 3d163b8a86bcc..bc4351eec5866 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3028,44 +3028,6 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { } } -using AllocaForValueMapTy = DenseMap<Value *, AllocaInst *>; -AllocaInst *llvm::findAllocaForValue(Value *V, - AllocaForValueMapTy &AllocaForValue) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return AI; - // See if we've already calculated (or started to calculate) alloca for a - // given value. - AllocaForValueMapTy::iterator I = AllocaForValue.find(V); - if (I != AllocaForValue.end()) - return I->second; - // Store 0 while we're calculating alloca for value V to avoid - // infinite recursion if the value references itself. - AllocaForValue[V] = nullptr; - AllocaInst *Res = nullptr; - if (CastInst *CI = dyn_cast<CastInst>(V)) - Res = findAllocaForValue(CI->getOperand(0), AllocaForValue); - else if (PHINode *PN = dyn_cast<PHINode>(V)) { - for (Value *IncValue : PN->incoming_values()) { - // Allow self-referencing phi-nodes. - if (IncValue == PN) - continue; - AllocaInst *IncValueAI = findAllocaForValue(IncValue, AllocaForValue); - // AI for incoming values should exist and should all be equal. - if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res)) - return nullptr; - Res = IncValueAI; - } - } else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) { - Res = findAllocaForValue(EP->getPointerOperand(), AllocaForValue); - } else { - LLVM_DEBUG(dbgs() << "Alloca search cancelled on unknown instruction: " - << *V << "\n"); - } - if (Res) - AllocaForValue[V] = Res; - return Res; -} - Value *llvm::invertCondition(Value *Condition) { // First: Check if it's a constant if (Constant *C = dyn_cast<Constant>(Condition)) diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp similarity index 92% rename from llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp rename to llvm/lib/Transforms/Utils/LoopPeel.cpp index c653aacbee6cc..ff9b9eca8fdf7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -1,4 +1,4 @@ -//===- UnrollLoopPeel.cpp - Loop peeling utilities ------------------------===// +//===- LoopPeel.cpp -------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information. @@ -6,12 +6,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements some loop unrolling utilities for peeling loops -// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for -// unrolling loops with compile-time constant trip counts. -// +// Loop Peeling Utilities. //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" @@ -49,10 +47,24 @@ using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "loop-unroll" +#define DEBUG_TYPE "loop-peel" STATISTIC(NumPeeled, "Number of loops peeled"); +static cl::opt<unsigned> UnrollPeelCount( + "unroll-peel-count", cl::Hidden, + cl::desc("Set the unroll peeling count, for testing purposes")); + +static cl::opt<bool> + UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, + cl::desc("Allows loops to be peeled when the dynamic " + "trip count is known to be low.")); + +static cl::opt<bool> + UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling", + cl::init(false), cl::Hidden, + cl::desc("Allows loop nests to be peeled.")); + static cl::opt<unsigned> UnrollPeelMaxCount( "unroll-peel-max-count", cl::init(7), cl::Hidden, cl::desc("Max average trip count which will cause loop peeling.")); @@ -278,9 +290,9 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, - unsigned &TripCount, ScalarEvolution &SE) { + unsigned &TripCount, ScalarEvolution &SE, + unsigned Threshold) { assert(LoopSize > 0 && "Zero loop size is not allowed!"); // Save the PP.PeelCount value set by the target in // TTI.getPeelingPreferences or by the flag -unroll-peel-count. @@ -322,7 +334,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // maximum number of iterations among these values, thus turning all those // Phis into invariants. // First, check that we can peel at least one iteration. - if (2 * LoopSize <= UP.Threshold && UnrollPeelMaxCount > 0) { + if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) { // Store the pre-calculated values here. SmallDenseMap IterationsToInvariance; // Now go through all Phis to calculate the number of iterations they @@ -342,7 +354,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // Pay respect to limitations implied by loop size and the max peel count.
unsigned MaxPeelCount = UnrollPeelMaxCount; - MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1); + MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1); DesiredPeelCount = std::max(DesiredPeelCount, countToEliminateCompares(*L, MaxPeelCount, SE)); @@ -385,7 +397,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, if (*PeelCount) { if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) && - (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { + (LoopSize * (*PeelCount + 1) <= Threshold)) { LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n"); PP.PeelCount = *PeelCount; @@ -396,7 +408,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n"); - LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n"); + LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n"); } } } @@ -491,7 +503,7 @@ static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR, /// instructions in the last peeled-off iteration. static void cloneLoopBlocks( Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot, - SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *> > &ExitEdges, + SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges, SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT, LoopInfo *LI) { @@ -599,6 +611,40 @@ static void cloneLoopBlocks( LVMap[KV.first] = KV.second; } +TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences( + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, + Optional<bool> UserAllowPeeling, + Optional<bool> UserAllowProfileBasedPeeling, bool UnrollingSpecficValues) { + TargetTransformInfo::PeelingPreferences PP; + + // Set the default values. + PP.PeelCount = 0; + PP.AllowPeeling = true; + PP.AllowLoopNestsPeeling = false; + PP.PeelProfiledIterations = true; + + // Get the target specific values. + TTI.getPeelingPreferences(L, SE, PP); + + // User specified values using cl::opt. + if (UnrollingSpecficValues) { + if (UnrollPeelCount.getNumOccurrences() > 0) + PP.PeelCount = UnrollPeelCount; + if (UnrollAllowPeeling.getNumOccurrences() > 0) + PP.AllowPeeling = UnrollAllowPeeling; + if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0) + PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling; + } + + // User specified values provided by argument. + if (UserAllowPeeling.hasValue()) + PP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + + return PP; +} + /// Peel off the first \p PeelCount iterations of loop \p L. /// /// Note that this does not peel them off as a single straight-line block. /// @@ -609,8 +655,8 @@ static void cloneLoopBlocks( /// for the bulk of dynamic execution, can be further simplified by scalar /// optimizations.
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, bool PreserveLCSSA) { + ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { assert(PeelCount > 0 && "Attempt to peel out zero iterations?"); assert(canPeel(L) && "Attempt to peel a loop which is not peelable?"); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 3875c631f839b..7bea696a853a9 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -59,6 +59,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopPeel.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -108,14 +109,15 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden, /// insert a phi-node, otherwise LCSSA will be broken. /// The function is just a helper function for llvm::UnrollLoop that returns /// true if this situation occurs, indicating that LCSSA needs to be fixed. -static bool needToInsertPhisForLCSSA(Loop *L, std::vector Blocks, +static bool needToInsertPhisForLCSSA(Loop *L, + const std::vector &Blocks, LoopInfo *LI) { for (BasicBlock *BB : Blocks) { if (LI->getLoopFor(BB) == L) continue; for (Instruction &I : *BB) { for (Use &U : I.operands()) { - if (auto Def = dyn_cast(U)) { + if (const auto *Def = dyn_cast(U)) { Loop *DefLoop = LI->getLoopFor(Def->getParent()); if (!DefLoop) continue; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 43363736684ee..d7cd9b19b8d51 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1145,7 +1145,7 @@ static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) { // producing an expression involving multiple pointers. Until then, we must // bail out here. // - // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject + // Retrieve the pointer operand of the GEP. Don't use getUnderlyingObject // because it understands lcssa phis while SCEV does not. Value *FromPtr = FromVal; Value *ToPtr = ToVal; @@ -1162,7 +1162,7 @@ static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) { // SCEV may have rewritten an expression that produces the GEP's pointer // operand. That's ok as long as the pointer operand has the same base - // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the + // pointer. Unlike getUnderlyingObject(), getPointerBase() will find the // base of a recurrence. This handles the case in which SCEV expansion // converts a pointer type recurrence into a nonrecurrent pointer base // indexed by an integer recurrence. 
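The LoopVersioning diff that follows is a standard new-pass-manager port: the transformation body moves into a free runImpl function, the legacy pass becomes a thin adapter around it, and a PassInfoMixin-based class is added alongside. A skeleton of that structure with placeholder names (ExamplePass and friends are not from the patch):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
// Shared logic, callable from both pass managers.
bool runImpl(Function &F) {
  return false; // transform F here; return whether anything changed
}

// Legacy pass manager adapter.
struct ExampleLegacyPass : FunctionPass {
  static char ID;
  ExampleLegacyPass() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override { return runImpl(F); }
};
char ExampleLegacyPass::ID = 0;
} // namespace

// New pass manager adapter: report which analyses survive.
struct ExamplePass : PassInfoMixin<ExamplePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    return runImpl(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};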
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 16bd08c704eeb..b4925064bc6b9 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -14,10 +14,15 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -253,47 +258,55 @@ void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst, } namespace { +bool runImpl(LoopInfo *LI, function_ref GetLAA, + DominatorTree *DT, ScalarEvolution *SE) { + // Build up a worklist of inner-loops to version. This is necessary as the + // act of versioning a loop creates new loops and can invalidate iterators + // across the loops. + SmallVector Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + const LoopAccessInfo &LAI = GetLAA(*L); + if (L->isLoopSimplifyForm() && !LAI.hasConvergentOp() && + (LAI.getNumRuntimePointerChecks() || + !LAI.getPSE().getUnionPredicate().isAlwaysTrue())) { + LoopVersioning LVer(LAI, L, LI, DT, SE); + LVer.versionLoop(); + LVer.annotateLoopWithNoAlias(); + Changed = true; + } + } + + return Changed; +} + /// Also expose this is a pass. Currently this is only used for /// unit-testing. It adds all memchecks necessary to remove all may-aliasing /// array accesses from the loop. -class LoopVersioningPass : public FunctionPass { +class LoopVersioningLegacyPass : public FunctionPass { public: - LoopVersioningPass() : FunctionPass(ID) { - initializeLoopVersioningPassPass(*PassRegistry::getPassRegistry()); + LoopVersioningLegacyPass() : FunctionPass(ID) { + initializeLoopVersioningLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { auto *LI = &getAnalysis().getLoopInfo(); - auto *LAA = &getAnalysis(); + auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & { + return getAnalysis().getInfo(&L); + }; + auto *DT = &getAnalysis().getDomTree(); auto *SE = &getAnalysis().getSE(); - // Build up a worklist of inner-loops to version. This is necessary as the - // act of versioning a loop creates new loops and can invalidate iterators - // across the loops. - SmallVector Worklist; - - for (Loop *TopLevelLoop : *LI) - for (Loop *L : depth_first(TopLevelLoop)) - // We only handle inner-most loops. - if (L->empty()) - Worklist.push_back(L); - - // Now walk the identified inner loops. 
- bool Changed = false; - for (Loop *L : Worklist) { - const LoopAccessInfo &LAI = LAA->getInfo(L); - if (L->isLoopSimplifyForm() && !LAI.hasConvergentOp() && - (LAI.getNumRuntimePointerChecks() || - !LAI.getPSE().getUnionPredicate().isAlwaysTrue())) { - LoopVersioning LVer(LAI, L, LI, DT, SE); - LVer.versionLoop(); - LVer.annotateLoopWithNoAlias(); - Changed = true; - } - } - - return Changed; + return runImpl(LI, GetLAA, DT, SE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -312,18 +325,44 @@ class LoopVersioningPass : public FunctionPass { #define LVER_OPTION "loop-versioning" #define DEBUG_TYPE LVER_OPTION -char LoopVersioningPass::ID; +char LoopVersioningLegacyPass::ID; static const char LVer_name[] = "Loop Versioning"; -INITIALIZE_PASS_BEGIN(LoopVersioningPass, LVER_OPTION, LVer_name, false, false) +INITIALIZE_PASS_BEGIN(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false, + false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(LoopVersioningPass, LVER_OPTION, LVer_name, false, false) +INITIALIZE_PASS_END(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false, + false) namespace llvm { -FunctionPass *createLoopVersioningPass() { - return new LoopVersioningPass(); +FunctionPass *createLoopVersioningLegacyPass() { + return new LoopVersioningLegacyPass(); } + +PreservedAnalyses LoopVersioningPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SE = AM.getResult(F); + auto &LI = AM.getResult(F); + auto &TTI = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &TLI = AM.getResult(F); + auto &AA = AM.getResult(F); + auto &AC = AM.getResult(F); + MemorySSA *MSSA = EnableMSSALoopDependency + ? &AM.getResult(F).getMSSA() + : nullptr; + + auto &LAM = AM.getResult(F).getManager(); + auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & { + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; + return LAM.getResult(L, AR); + }; + + if (runImpl(&LI, GetLAA, &DT, &SE)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index c7e9c919ec471..bfb915eca1f3a 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -62,10 +62,6 @@ STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { - // FIXME: If the memory unit is of pointer or integer type, we can permit - // assignments to subsections of the memory unit. - unsigned AS = AI->getType()->getAddressSpace(); - // Only allow direct and non-volatile loads and stores... 
for (const User *U : AI->users()) { if (const LoadInst *LI = dyn_cast(U)) { @@ -81,19 +77,18 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { if (SI->isVolatile()) return false; } else if (const IntrinsicInst *II = dyn_cast(U)) { - if (!II->isLifetimeStartOrEnd()) + if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) return false; } else if (const BitCastInst *BCI = dyn_cast(U)) { - if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) - return false; - if (!onlyUsedByLifetimeMarkers(BCI)) + if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI)) return false; } else if (const GetElementPtrInst *GEPI = dyn_cast(U)) { - if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) - return false; if (!GEPI->hasAllZeroIndices()) return false; - if (!onlyUsedByLifetimeMarkers(GEPI)) + if (!onlyUsedByLifetimeMarkersOrDroppableInsts(GEPI)) + return false; + } else if (const AddrSpaceCastInst *ASCI = dyn_cast(U)) { + if (!onlyUsedByLifetimeMarkers(ASCI)) return false; } else { return false; @@ -312,23 +307,37 @@ static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { AC->registerAssumption(CI); } -static void removeLifetimeIntrinsicUsers(AllocaInst *AI) { +static void removeIntrinsicUsers(AllocaInst *AI) { // Knowing that this alloca is promotable, we know that it's safe to kill all // instructions except for load and store. - for (auto UI = AI->user_begin(), UE = AI->user_end(); UI != UE;) { - Instruction *I = cast(*UI); + for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) { + Instruction *I = cast(UI->getUser()); + Use &U = *UI; ++UI; if (isa(I) || isa(I)) continue; + // Drop the use of AI in droppable instructions. + if (I->isDroppable()) { + I->dropDroppableUse(U); + continue; + } + if (!I->getType()->isVoidTy()) { // The only users of this bitcast/GEP instruction are lifetime intrinsics. // Follow the use/def chain to erase them now instead of leaving it for // dead code elimination later. - for (auto UUI = I->user_begin(), UUE = I->user_end(); UUI != UUE;) { - Instruction *Inst = cast(*UUI); + for (auto UUI = I->use_begin(), UUE = I->use_end(); UUI != UUE;) { + Instruction *Inst = cast(UUI->getUser()); + Use &UU = *UUI; ++UUI; + + // Drop the use of I in droppable instructions. + if (Inst->isDroppable()) { + Inst->dropDroppableUse(UU); + continue; + } Inst->eraseFromParent(); } } @@ -544,7 +553,7 @@ void PromoteMem2Reg::run() { assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); - removeLifetimeIntrinsicUsers(AI); + removeIntrinsicUsers(AI); if (AI->use_empty()) { // If there are no uses of the alloca, just delete it now. diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 3527ca4bbbbcc..aaa28feb32b86 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -461,9 +462,10 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, // we didn't find any operands that could be factored, tentatively // assume that element zero was selected (since the zero offset // would obviously be folded away). - Value *Scaled = ScaledOps.empty() ? 
- Constant::getNullValue(Ty) : - expandCodeFor(SE.getAddExpr(ScaledOps), Ty); + Value *Scaled = + ScaledOps.empty() + ? Constant::getNullValue(Ty) + : expandCodeForImpl(SE.getAddExpr(ScaledOps), Ty, false); GepIndices.push_back(Scaled); // Collect struct field index operands. @@ -522,7 +524,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint())); // Expand the operands for a plain byte offset. - Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty); + Value *Idx = expandCodeForImpl(SE.getAddExpr(Ops), Ty, false); // Fold a GEP with constant operands. if (Constant *CLHS = dyn_cast<Constant>(V)) @@ -743,14 +745,14 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op)); } else if (Op->isNonConstantNegative()) { // Instead of doing a negate and add, just do a subtract. - Value *W = expandCodeFor(SE.getNegativeSCEV(Op), Ty); + Value *W = expandCodeForImpl(SE.getNegativeSCEV(Op), Ty, false); Sum = InsertNoopCastOfTo(Sum, Ty); Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); ++I; } else { // A simple add. - Value *W = expandCodeFor(Op, Ty); + Value *W = expandCodeForImpl(Op, Ty, false); Sum = InsertNoopCastOfTo(Sum, Ty); // Canonicalize a constant to the RHS. if (isa<Constant>(Sum)) std::swap(Sum, W); @@ -802,7 +804,7 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { // Calculate powers with exponents 1, 2, 4, 8 etc. and include those of them // that are needed into the result. - Value *P = expandCodeFor(I->second, Ty); + Value *P = expandCodeForImpl(I->second, Ty, false); Value *Result = nullptr; if (Exponent & 1) Result = P; @@ -861,7 +863,7 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *LHS = expandCodeFor(S->getLHS(), Ty); + Value *LHS = expandCodeForImpl(S->getLHS(), Ty, false); if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) { const APInt &RHS = SC->getAPInt(); if (RHS.isPowerOf2()) @@ -870,7 +872,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); } - Value *RHS = expandCodeFor(S->getRHS(), Ty); + Value *RHS = expandCodeForImpl(S->getRHS(), Ty, false); return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap, /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS())); } @@ -1185,6 +1187,14 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, if (!SE.isSCEVable(PN.getType())) continue; + // We should not look for an incomplete PHI. Getting SCEV for an incomplete + // PHI has no meaning at all. + if (!PN.isComplete()) { + DEBUG_WITH_TYPE( + DebugType, dbgs() << "One incomplete PHI is found: " << PN << "\n"); + continue; + } + const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN)); if (!PhiSCEV) continue; @@ -1265,8 +1275,9 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Expand code for the start value into the loop preheader. assert(L->getLoopPreheader() && "Can't expand add recurrences without a loop preheader!"); - Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy, - L->getLoopPreheader()->getTerminator()); + Value *StartV = + expandCodeForImpl(Normalized->getStart(), ExpandTy, + L->getLoopPreheader()->getTerminator(), false); // StartV must have been inserted into L's preheader to dominate the new // phi.
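All the expandCodeFor call sites in this file become expandCodeForImpl calls with a trailing bool. The public wrappers live in the header, which this diff does not show, so the exact forwarding is an assumption; the presumed layering, as a self-contained mock:

// Mock of the presumed layering (names and semantics hedged): the public
// entry point keeps its old signature and forwards with Root = true, while
// the internal recursive expansions rewritten above all pass Root = false.
struct MockExpander {
  int expandCodeFor(int Expr) { // public API, signature unchanged for callers
    return expandCodeForImpl(Expr, /*Root=*/true);
  }
  int expandCodeForImpl(int Expr, bool Root) {
    (void)Root;  // distinguishes the outermost expansion from nested ones
    return Expr; // stand-in for the real SCEV expansion
  }
};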
@@ -1284,8 +1295,8 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, if (useSubtract) Step = SE.getNegativeSCEV(Step); // Expand the step somewhere that dominates the loop header. - Value *StepV = expandCodeFor(Step, IntTy, - &*L->getHeader()->getFirstInsertionPt()); + Value *StepV = expandCodeForImpl( + Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false); // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if // we actually do emit an addition. It does not apply if we emit a @@ -1430,8 +1441,8 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { { // Expand the step somewhere that dominates the loop header. SCEVInsertPointGuard Guard(Builder, this); - StepV = expandCodeFor(Step, IntTy, - &*L->getHeader()->getFirstInsertionPt()); + StepV = expandCodeForImpl( + Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false); } Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract); } @@ -1450,8 +1461,8 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Invert the result. if (InvertStep) - Result = Builder.CreateSub(expandCodeFor(Normalized->getStart(), TruncTy), - Result); + Result = Builder.CreateSub( + expandCodeForImpl(Normalized->getStart(), TruncTy, false), Result); } // Re-apply any non-loop-dominating scale. @@ -1459,22 +1470,22 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { assert(S->isAffine() && "Can't linearly scale non-affine recurrences."); Result = InsertNoopCastOfTo(Result, IntTy); Result = Builder.CreateMul(Result, - expandCodeFor(PostLoopScale, IntTy)); + expandCodeForImpl(PostLoopScale, IntTy, false)); } // Re-apply any non-loop-dominating offset. if (PostLoopOffset) { if (PointerType *PTy = dyn_cast(ExpandTy)) { if (Result->getType()->isIntegerTy()) { - Value *Base = expandCodeFor(PostLoopOffset, ExpandTy); + Value *Base = expandCodeForImpl(PostLoopOffset, ExpandTy, false); Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base); } else { Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result); } } else { Result = InsertNoopCastOfTo(Result, IntTy); - Result = Builder.CreateAdd(Result, - expandCodeFor(PostLoopOffset, IntTy)); + Result = Builder.CreateAdd( + Result, expandCodeForImpl(PostLoopOffset, IntTy, false)); } } @@ -1516,8 +1527,8 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { S->getNoWrapFlags(SCEV::FlagNW))); BasicBlock::iterator NewInsertPt = findInsertPointAfter(cast(V), Builder.GetInsertBlock()); - V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, - &*NewInsertPt); + V = expandCodeForImpl(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, + &*NewInsertPt, false); return V; } @@ -1632,22 +1643,25 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) { Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expandCodeFor(S->getOperand(), - SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *V = expandCodeForImpl( + S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()), + false); return Builder.CreateTrunc(V, Ty); } Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) { Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expandCodeFor(S->getOperand(), - SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *V = expandCodeForImpl( + S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()), + false); return 
Builder.CreateZExt(V, Ty); } Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expandCodeFor(S->getOperand(), - SE.getEffectiveSCEVType(S->getOperand()->getType())); + Value *V = expandCodeForImpl( + S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()), + false); return Builder.CreateSExt(V, Ty); } @@ -1662,7 +1676,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { Ty = SE.getEffectiveSCEVType(Ty); LHS = InsertNoopCastOfTo(LHS, Ty); } - Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); Value *ICmp = Builder.CreateICmpSGT(LHS, RHS); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax"); LHS = Sel; @@ -1685,7 +1699,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Ty = SE.getEffectiveSCEVType(Ty); LHS = InsertNoopCastOfTo(LHS, Ty); } - Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); Value *ICmp = Builder.CreateICmpUGT(LHS, RHS); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax"); LHS = Sel; @@ -1708,7 +1722,7 @@ Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) { Ty = SE.getEffectiveSCEVType(Ty); LHS = InsertNoopCastOfTo(LHS, Ty); } - Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); Value *ICmp = Builder.CreateICmpSLT(LHS, RHS); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin"); LHS = Sel; @@ -1731,7 +1745,7 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { Ty = SE.getEffectiveSCEVType(Ty); LHS = InsertNoopCastOfTo(LHS, Ty); } - Value *RHS = expandCodeFor(S->getOperand(i), Ty); + Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false); Value *ICmp = Builder.CreateICmpULT(LHS, RHS); Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin"); LHS = Sel; @@ -1743,15 +1757,45 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { return LHS; } -Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, - Instruction *IP) { +Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, + Instruction *IP, bool Root) { setInsertPoint(IP); - return expandCodeFor(SH, Ty); + Value *V = expandCodeForImpl(SH, Ty, Root); + return V; } -Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { +Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) { // Expand the code for this SCEV. Value *V = expand(SH); + + if (PreserveLCSSA) { + if (auto *Inst = dyn_cast<Instruction>(V)) { + // Create a temporary instruction at the current insertion point, so we + // can hand it off to the helper to create LCSSA PHIs if required for the + // new use. + // FIXME: Ideally formLCSSAForInstructions (used in fixupLCSSAFormFor) + // would accept an insertion point and return an LCSSA phi for that + // insertion point, so there is no need to insert & remove the temporary + // instruction. + Instruction *Tmp; + if (Inst->getType()->isIntegerTy()) + Tmp = + cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user")); + else { + assert(Inst->getType()->isPointerTy()); + Tmp = cast<Instruction>( + Builder.CreateGEP(Inst, Builder.getInt32(1), "tmp.lcssa.user")); + } + V = fixupLCSSAFormFor(Tmp, 0); + + // Clean up temporary instruction.
@@ -1743,15 +1757,45 @@
   return LHS;
 }
 
-Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty,
-                                   Instruction *IP) {
+Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty,
+                                       Instruction *IP, bool Root) {
   setInsertPoint(IP);
-  return expandCodeFor(SH, Ty);
+  Value *V = expandCodeForImpl(SH, Ty, Root);
+  return V;
 }
 
-Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) {
+Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
   // Expand the code for this SCEV.
   Value *V = expand(SH);
+
+  if (PreserveLCSSA) {
+    if (auto *Inst = dyn_cast<Instruction>(V)) {
+      // Create a temporary instruction at the current insertion point, so we
+      // can hand it off to the helper to create LCSSA PHIs if required for the
+      // new use.
+      // FIXME: Ideally formLCSSAForInstructions (used in fixupLCSSAFormFor)
+      // would accept an insertion point and return an LCSSA phi for that
+      // insertion point, so there is no need to insert & remove the temporary
+      // instruction.
+      Instruction *Tmp;
+      if (Inst->getType()->isIntegerTy())
+        Tmp =
+            cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user"));
+      else {
+        assert(Inst->getType()->isPointerTy());
+        Tmp = cast<Instruction>(
+            Builder.CreateGEP(Inst, Builder.getInt32(1), "tmp.lcssa.user"));
+      }
+      V = fixupLCSSAFormFor(Tmp, 0);
+
+      // Clean up temporary instruction.
+      InsertedValues.erase(Tmp);
+      InsertedPostIncValues.erase(Tmp);
+      Tmp->eraseFromParent();
+    }
+  }
+
+  InsertedExpressions[std::make_pair(SH, &*Builder.GetInsertPoint())] = V;
   if (Ty) {
     assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
            "non-trivial casts should be done with the SCEVs directly!");
@@ -1891,10 +1935,25 @@ Value *SCEVExpander::expand(const SCEV *S) {
 }
 
 void SCEVExpander::rememberInstruction(Value *I) {
-  if (!PostIncLoops.empty())
-    InsertedPostIncValues.insert(I);
-  else
-    InsertedValues.insert(I);
+  auto DoInsert = [this](Value *V) {
+    if (!PostIncLoops.empty())
+      InsertedPostIncValues.insert(V);
+    else
+      InsertedValues.insert(V);
+  };
+  DoInsert(I);
+
+  if (!PreserveLCSSA)
+    return;
+
+  if (auto *Inst = dyn_cast<Instruction>(I)) {
+    // A new instruction has been added, which might introduce new uses outside
+    // a defining loop. Fix LCSSA form for each operand of the new instruction,
+    // if required.
+    for (unsigned OpIdx = 0, OpEnd = Inst->getNumOperands(); OpIdx != OpEnd;
+         OpIdx++)
+      fixupLCSSAFormFor(Inst, OpIdx);
+  }
 }
 
 /// getOrInsertCanonicalInductionVariable - This method returns the
@@ -1913,9 +1972,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
 
   // Emit code for it.
   SCEVInsertPointGuard Guard(Builder, this);
-  PHINode *V =
-      cast<PHINode>(expandCodeFor(H, nullptr,
-                                  &*L->getHeader()->getFirstInsertionPt()));
+  PHINode *V = cast<PHINode>(expandCodeForImpl(
+      H, nullptr, &*L->getHeader()->getFirstInsertionPt(), false));
 
   return V;
 }
@@ -2049,6 +2107,8 @@ SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
       }
       DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
                                         << *Phi << '\n');
+      DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Original iv: "
+                                        << *OrigPhiRef << '\n');
       ++NumElim;
       Value *NewIV = OrigPhiRef;
       if (OrigPhiRef->getType() != Phi->getType()) {
@@ -2150,8 +2210,9 @@ bool SCEVExpander::isHighCostExpansionHelper(
       llvm_unreachable("There are no other cast types.");
     }
     const SCEV *Op = CastExpr->getOperand();
-    BudgetRemaining -= TTI.getCastInstrCost(Opcode, /*Dst=*/S->getType(),
-                                            /*Src=*/Op->getType(), CostKind);
+    BudgetRemaining -= TTI.getCastInstrCost(
+        Opcode, /*Dst=*/S->getType(),
+        /*Src=*/Op->getType(), TTI::CastContextHint::None, CostKind);
     Worklist.emplace_back(Op);
     return false; // Will answer upon next entry into this function.
   }
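The comment "Will answer upon next entry" reflects how isHighCostExpansionHelper accounts costs: each visited subexpression charges against a shared budget, and operands are pushed onto a worklist to be charged on later entries. A self-contained sketch of that accounting pattern, outside of LLVM's types (illustrative only):

#include <vector>

// Worklist-driven budget accounting as used above: each visited node charges
// its cost against a shared budget; the expansion counts as "high cost" once
// the budget goes negative.
struct Node {
  int Cost;
  std::vector<Node *> Ops;
};

static bool isHighCost(Node *Root, int Budget) {
  std::vector<Node *> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    Budget -= N->Cost; // charge this node
    if (Budget < 0)
      return true;     // answered on a later entry, once the budget is spent
    for (Node *Op : N->Ops)
      Worklist.push_back(Op);
  }
  return false;
}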
@@ -2314,8 +2375,10 @@ Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
 
 Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
                                           Instruction *IP) {
-  Value *Expr0 = expandCodeFor(Pred->getLHS(), Pred->getLHS()->getType(), IP);
-  Value *Expr1 = expandCodeFor(Pred->getRHS(), Pred->getRHS()->getType(), IP);
+  Value *Expr0 =
+      expandCodeForImpl(Pred->getLHS(), Pred->getLHS()->getType(), IP, false);
+  Value *Expr1 =
+      expandCodeForImpl(Pred->getRHS(), Pred->getRHS()->getType(), IP, false);
   Builder.SetInsertPoint(IP);
   auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
@@ -2347,15 +2410,16 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
   Builder.SetInsertPoint(Loc);
-  Value *TripCountVal = expandCodeFor(ExitCount, CountTy, Loc);
+  Value *TripCountVal = expandCodeForImpl(ExitCount, CountTy, Loc, false);
 
   IntegerType *Ty =
       IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
   Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
 
-  Value *StepValue = expandCodeFor(Step, Ty, Loc);
-  Value *NegStepValue = expandCodeFor(SE.getNegativeSCEV(Step), Ty, Loc);
-  Value *StartValue = expandCodeFor(Start, ARExpandTy, Loc);
+  Value *StepValue = expandCodeForImpl(Step, Ty, Loc, false);
+  Value *NegStepValue =
+      expandCodeForImpl(SE.getNegativeSCEV(Step), Ty, Loc, false);
+  Value *StartValue = expandCodeForImpl(Start, ARExpandTy, Loc, false);
 
   ConstantInt *Zero =
       ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
@@ -2458,6 +2522,34 @@ Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
   return Check;
 }
 
+Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
+  assert(PreserveLCSSA);
+  SmallVector<Instruction *, 1> ToUpdate;
+
+  auto *OpV = User->getOperand(OpIdx);
+  auto *OpI = dyn_cast<Instruction>(OpV);
+  if (!OpI)
+    return OpV;
+
+  Loop *DefLoop = SE.LI.getLoopFor(OpI->getParent());
+  Loop *UseLoop = SE.LI.getLoopFor(User->getParent());
+  if (!DefLoop || UseLoop == DefLoop || DefLoop->contains(UseLoop))
+    return OpV;
+
+  ToUpdate.push_back(OpI);
+  SmallVector<PHINode *, 16> PHIsToRemove;
+  formLCSSAForInstructions(ToUpdate, SE.DT, SE.LI, &SE, Builder, &PHIsToRemove);
+  for (PHINode *PN : PHIsToRemove) {
+    if (!PN->use_empty())
+      continue;
+    InsertedValues.erase(PN);
+    InsertedPostIncValues.erase(PN);
+    PN->eraseFromParent();
+  }
+
+  return User->getOperand(OpIdx);
+}
+
 namespace {
 // Search for a SCEV subexpression that is not safe to expand.  Any expression
 // that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
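fixupLCSSAFormFor only rewrites a use when the operand is defined in a loop that does not contain the user; every other def/use relationship is left alone by the early exits above. A standalone sketch of just that containment test (assumes a computed LoopInfo; the helper name is illustrative, not an LLVM API):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns true if using Def at User would break LCSSA form, i.e. the value
// escapes its defining loop and must therefore be routed through an exit PHI.
static bool useBreaksLCSSA(Instruction *Def, Instruction *User, LoopInfo &LI) {
  Loop *DefLoop = LI.getLoopFor(Def->getParent());
  Loop *UseLoop = LI.getLoopFor(User->getParent());
  // Values not defined in any loop, and uses inside the defining loop (or a
  // loop nested within it), are already in LCSSA form.
  return DefLoop && UseLoop != DefLoop && !DefLoop->contains(UseLoop);
}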
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index cfcc3454a2102..b03389aec7bf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -20,7 +20,6 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/CaptureTracking.h"
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 6ff08cd287124..1939c0e3b504b 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -393,7 +393,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   if (!Src)
     return -1;
 
-  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Src));
   if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
     return -1;
 
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9b81afbb4b6cb..26ffb044a35c2 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -762,8 +762,8 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
   return Chain.slice(0, ChainIdx);
 }
 
-static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
-  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
+static ChainID getChainID(const Value *Ptr) {
+  const Value *ObjPtr = getUnderlyingObject(Ptr);
   if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
     // The selects themselves are distinct instructions even if they share the
     // same condition and evaluate to consecutive pointers for true and false
@@ -830,7 +830,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
         continue;
 
       // Save the load locations.
-      const ChainID ID = getChainID(Ptr, DL);
+      const ChainID ID = getChainID(Ptr);
       LoadRefs[ID].push_back(LI);
     } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
       if (!SI->isSimple())
@@ -876,7 +876,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
         continue;
 
       // Save store location.
-      const ChainID ID = getChainID(Ptr, DL);
+      const ChainID ID = getChainID(Ptr);
       StoreRefs[ID].push_back(SI);
     }
   }
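getChainID now keys chains on getUnderlyingObject(Ptr) alone; the DataLayout parameter disappears because the renamed walk no longer takes one. A hedged sketch of the same bucketing idea in isolation (the container choice and helper name are illustrative; the pass itself stores chains in its own LoadRefs/StoreRefs maps):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Group candidate loads by their underlying object, the same bucketing
// getChainID performs above. Loads based on distinct underlying objects can
// never belong to one vectorizable chain, so they never share a bucket.
static MapVector<const Value *, SmallVector<LoadInst *, 8>>
groupByUnderlyingObject(ArrayRef<LoadInst *> Loads) {
  MapVector<const Value *, SmallVector<LoadInst *, 8>> Buckets;
  for (LoadInst *LI : Loads)
    Buckets[getUnderlyingObject(LI->getPointerOperand())].push_back(LI);
  return Buckets;
}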
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 120b544808bed..e2a95d6f67a86 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -13,9 +13,11 @@
 // pass. It should be easy to create an analysis pass around it if there
 // is a need (but D45420 needs to happen first).
 //
+
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d207ca250783f..33bd31f6b9833 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6458,13 +6458,54 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    // Computes the CastContextHint from a Load/Store instruction.
+    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
+      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+             "Expected a load or a store!");
+
+      if (VF == 1 || !TheLoop->contains(I))
+        return TTI::CastContextHint::Normal;
+
+      switch (getWideningDecision(I, VF)) {
+      case LoopVectorizationCostModel::CM_GatherScatter:
+        return TTI::CastContextHint::GatherScatter;
+      case LoopVectorizationCostModel::CM_Interleave:
+        return TTI::CastContextHint::Interleave;
+      case LoopVectorizationCostModel::CM_Scalarize:
+      case LoopVectorizationCostModel::CM_Widen:
+        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
+                                        : TTI::CastContextHint::Normal;
+      case LoopVectorizationCostModel::CM_Widen_Reverse:
+        return TTI::CastContextHint::Reversed;
+      case LoopVectorizationCostModel::CM_Unknown:
+        llvm_unreachable("Instr did not go through cost modelling?");
+      }
+
+      llvm_unreachable("Unhandled case!");
+    };
+
+    unsigned Opcode = I->getOpcode();
+    TTI::CastContextHint CCH = TTI::CastContextHint::None;
+    // For Trunc, the context is the only user, which must be a StoreInst.
+    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
+      if (I->hasOneUse())
+        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
+          CCH = ComputeCCH(Store);
+    }
+    // For Z/Sext, the context is the operand, which must be a LoadInst.
+    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
+             Opcode == Instruction::FPExt) {
+      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
+        CCH = ComputeCCH(Load);
+    }
+
     // We optimize the truncation of induction variables having constant
     // integer steps. The cost of these truncations is the same as the scalar
     // operation.
     if (isOptimizableIVTruncate(I, VF)) {
       auto *Trunc = cast<TruncInst>(I);
       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
-                                  Trunc->getSrcTy(), CostKind, Trunc);
+                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
     }
 
     Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6477,12 +6518,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     //
     // Calculate the modified src and dest types.
     Type *MinVecTy = VectorTy;
-    if (I->getOpcode() == Instruction::Trunc) {
+    if (Opcode == Instruction::Trunc) {
       SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
       VectorTy =
           largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
-    } else if (I->getOpcode() == Instruction::ZExt ||
-               I->getOpcode() == Instruction::SExt) {
+    } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
       SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
       VectorTy =
           smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
@@ -6490,8 +6530,8 @@
     }
 
     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
-                                    CostKind, I);
+    return N *
+           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -7419,7 +7459,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
 
   // Finally, if tail is folded by masking, introduce selects between the phi
   // and the live-out instruction of each reduction, at the end of the latch.
-  if (CM.foldTailByMasking()) {
+  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
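The new CastContextHint argument lets getCastInstrCost distinguish a cast that folds into a masked, reversed, interleaved, or gather/scatter memory operation from a free-standing one, which is exactly what ComputeCCH derives from the widening decision above. A minimal sketch of querying the cost model with a hint (the types and cost kind are illustrative; the five-argument signature mirrors the calls in this patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Cost of a vector zext whose source comes from a masked load, mirroring the
// ComputeCCH result for a CM_Widen decision with a required mask.
static int zextFromMaskedLoadCost(TargetTransformInfo &TTI, Type *SrcVecTy,
                                  Type *DstVecTy) {
  return TTI.getCastInstrCost(Instruction::ZExt, DstVecTy, SrcVecTy,
                              TTI::CastContextHint::Masked,
                              TTI::TCK_RecipThroughput);
}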
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5fb8ad56d8b3e..b5b3fa59b8e92 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3399,8 +3399,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
               Ext->getOpcode(), Ext->getType(), VecTy, i);
           // Add back the cost of s|zext which is subtracted separately.
           DeadCost += TTI->getCastInstrCost(
-              Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
-              Ext);
+              Ext->getOpcode(), Ext->getType(), E->getType(),
+              TTI::getCastContextHint(Ext), CostKind, Ext);
           continue;
         }
       }
@@ -3424,8 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
     case Instruction::BitCast: {
       Type *SrcTy = VL0->getOperand(0)->getType();
       int ScalarEltCost =
-          TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
-                                VL0);
+          TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
+                                TTI::getCastContextHint(VL0), CostKind, VL0);
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
@@ -3437,9 +3437,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       int VecCost = 0;
       // Check if the values are candidates to demote.
       if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost = ReuseShuffleCost +
-                  TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
-                                        CostKind, VL0);
+        VecCost =
+            ReuseShuffleCost +
+            TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+                                  TTI::getCastContextHint(VL0), CostKind, VL0);
       }
       return VecCost - ScalarCost;
     }
@@ -3644,9 +3645,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
         auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
         auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
         VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
-                                        CostKind);
+                                        TTI::CastContextHint::None, CostKind);
         VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
-                                         CostKind);
+                                         TTI::CastContextHint::None, CostKind);
       }
       VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
       return ReuseShuffleCost + VecCost - ScalarCost;
@@ -5911,7 +5912,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
       continue;
     if (!isValidElementType(SI->getValueOperand()->getType()))
      continue;
-    Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+    Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
   }
 
   // Ignore getelementptr instructions that have more than one index, a
@@ -7397,8 +7398,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      Type *EltTy = (*IncIt)->getType();
-      unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
-                                          : MaxVecRegSize;
+
+      assert(EltTy->isSized() &&
+             "Instructions should all be sized at this point");
+      TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
+      if (EltTS.isScalable()) {
+        // For now, just ignore vectorizing scalable types.
+        ++IncIt;
+        continue;
+      }
+
+      unsigned EltSize = EltTS.getFixedSize();
       unsigned MaxNumElts = MaxVecRegSize / EltSize;
       if (MaxNumElts < 2) {
         ++IncIt;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
index 9019ed15ec5ff..b072ca9c39206 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -124,7 +124,7 @@ bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
     for (auto &I : *Parent) {
       auto *VPI = cast<VPInstruction>(&I);
       if (VPI->getOpcode() == Instruction::Load &&
-          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+          llvm::is_contained(Operands, VPI))
         LoadsSeen++;
 
     if (LoadsSeen == Operands.size())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index b384c94121e9b..6eec8d14de4af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -65,9 +65,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) {
     for (const VPBlockBase *Succ : Successors) {
       // There must be a bi-directional link between block and successor.
       const auto &SuccPreds = Succ->getPredecessors();
-      assert(std::find(SuccPreds.begin(), SuccPreds.end(), VPB) !=
-                 SuccPreds.end() &&
-             "Missing predecessor link.");
+      assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link.");
       (void)SuccPreds;
     }
 
@@ -86,9 +84,7 @@ static void verifyBlocksInRegion(const VPRegionBlock *Region) {
       // There must be a bi-directional link between block and predecessor.
const auto &PredSuccs = Pred->getSuccessors(); - assert(std::find(PredSuccs.begin(), PredSuccs.end(), VPB) != - PredSuccs.end() && - "Missing successor link."); + assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link."); (void)PredSuccs; } } diff --git a/llvm/test/Analysis/BasicAA/phi-loop.ll b/llvm/test/Analysis/BasicAA/phi-loop.ll index db3023c6560d0..e54752a9223f7 100644 --- a/llvm/test/Analysis/BasicAA/phi-loop.ll +++ b/llvm/test/Analysis/BasicAA/phi-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -basic-aa-recphi=1 -gvn -S | FileCheck %s +; RUN: opt < %s -basic-aa -gvn -S | FileCheck %s ; ; Check that section->word_ofs doesn't get reloaded in every iteration of the ; for loop. diff --git a/llvm/test/Analysis/BasicAA/phi-spec-order.ll b/llvm/test/Analysis/BasicAA/phi-spec-order.ll index f8586f094c2ce..e5d435c09ccc7 100644 --- a/llvm/test/Analysis/BasicAA/phi-spec-order.ll +++ b/llvm/test/Analysis/BasicAA/phi-spec-order.ll @@ -1,5 +1,5 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s @X = external global [16000 x double], align 32 diff --git a/llvm/test/Analysis/BasicAA/recphi.ll b/llvm/test/Analysis/BasicAA/recphi.ll index dfc88937bf699..a6157f22a4f97 100644 --- a/llvm/test/Analysis/BasicAA/recphi.ll +++ b/llvm/test/Analysis/BasicAA/recphi.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -basic-aa-recphi -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s ; CHECK-LABEL: Function: simple: 5 pointers, 0 call sites ; CHECK: NoAlias: float* %src1, float* %src2 diff --git a/llvm/test/Analysis/ConstantFolding/abs.ll b/llvm/test/Analysis/ConstantFolding/abs.ll new file mode 100644 index 0000000000000..7b3a146e86143 --- /dev/null +++ b/llvm/test/Analysis/ConstantFolding/abs.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -constprop -S | FileCheck %s + +declare i8 @llvm.abs.i8(i8, i1) +declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1) + +define i8 @undef_val_min_poison() { +; CHECK-LABEL: @undef_val_min_poison( +; CHECK-NEXT: ret i8 undef +; + %r = call i8 @llvm.abs.i8(i8 undef, i1 true) + ret i8 %r +} + +define i8 @undef_val_min_not_poison() { +; CHECK-LABEL: @undef_val_min_not_poison( +; CHECK-NEXT: ret i8 0 +; + %r = call i8 @llvm.abs.i8(i8 undef, i1 false) + ret i8 %r +} + +define i8 @min_val_min_poison() { +; CHECK-LABEL: @min_val_min_poison( +; CHECK-NEXT: ret i8 undef +; + %r = call i8 @llvm.abs.i8(i8 -128, i1 true) + ret i8 %r +} + +define i8 @min_val_min_not_poison() { +; CHECK-LABEL: @min_val_min_not_poison( +; CHECK-NEXT: ret i8 -128 +; + %r = call i8 @llvm.abs.i8(i8 -128, i1 false) + ret i8 %r +} + +define <8 x i8> @vec_const() { +; CHECK-LABEL: @vec_const( +; CHECK-NEXT: ret <8 x i8> +; + %r = call <8 x i8> @llvm.abs.v8i8(<8 x i8> , i1 1) + ret <8 x i8> %r +} diff --git a/llvm/test/Analysis/ConstantFolding/freeze.ll b/llvm/test/Analysis/ConstantFolding/freeze.ll new file mode 100644 index 0000000000000..4908f71b924ec --- /dev/null +++ b/llvm/test/Analysis/ConstantFolding/freeze.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -constprop -S | FileCheck %s + +@g 
= external global i16, align 1 +@g2 = external global i16, align 1 + +define i64 @ptrdiff1() { +; CHECK-LABEL: @ptrdiff1( +; CHECK-NEXT: ret i64 sub (i64 ptrtoint (i16* @g to i64), i64 ptrtoint (i16* @g2 to i64)) +; + %i = ptrtoint i16* @g to i64 + %i2 = ptrtoint i16* @g2 to i64 + %diff = sub i64 %i, %i2 + %r = freeze i64 %diff + ret i64 %r +} + +define i64 @ptrdiff2() { +; CHECK-LABEL: @ptrdiff2( +; CHECK-NEXT: ret i64 -2 +; + %i = ptrtoint i16* @g to i64 + %gep = getelementptr i16, i16* @g, i64 1 + %i2 = ptrtoint i16* %gep to i64 + %diff = sub i64 %i, %i2 + %r = freeze i64 %diff + ret i64 %r +} diff --git a/llvm/test/Analysis/ConstantFolding/min-max.ll b/llvm/test/Analysis/ConstantFolding/min-max.ll index b872e4a1aca8c..c01fd23683d1b 100644 --- a/llvm/test/Analysis/ConstantFolding/min-max.ll +++ b/llvm/test/Analysis/ConstantFolding/min-max.ll @@ -1,136 +1,202 @@ -; RUN: opt -instcombine -S -o - %s | FileCheck %s -; Tests that constant folding of min and max operations works as expected. +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -constprop -S < %s | FileCheck %s declare float @llvm.minnum.f32(float, float) -declare float @llvm.maxnum.f32(float, float) declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) + +declare float @llvm.maxnum.f32(float, float) declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) declare float @llvm.minimum.f32(float, float) -declare float @llvm.maximum.f32(float, float) declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>) + +declare float @llvm.maximum.f32(float, float) declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) -; CHECK: define float @minnum_float() { +declare i8 @llvm.smax.i8(i8, i8) +declare <5 x i8> @llvm.smax.v5i8(<5 x i8>, <5 x i8>) + +declare i8 @llvm.smin.i8(i8, i8) +declare <5 x i8> @llvm.smin.v5i8(<5 x i8>, <5 x i8>) + +declare i8 @llvm.umax.i8(i8, i8) +declare <5 x i8> @llvm.umax.v5i8(<5 x i8>, <5 x i8>) + +declare i8 @llvm.umin.i8(i8, i8) +declare <5 x i8> @llvm.umin.v5i8(<5 x i8>, <5 x i8>) + define float @minnum_float() { - ; CHECK-NEXT: ret float 5.000000e+00 +; CHECK-LABEL: @minnum_float( +; CHECK-NEXT: ret float 5.000000e+00 +; %1 = call float @llvm.minnum.f32(float 5.0, float 42.0) ret float %1 } ; Check that minnum constant folds to propagate non-NaN or smaller argument -; CHECK: define <4 x float> @minnum_float_vec() { + define <4 x float> @minnum_float_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.minnum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @minnum_float_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.minnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } ; Check that minnum constant folds to propagate one of its argument zeros -; CHECK: define <4 x float> @minnum_float_zeros_vec() { + define <4 x float> @minnum_float_zeros_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.minnum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @minnum_float_zeros_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.minnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } -; CHECK: define float @maxnum_float() { define float @maxnum_float() { - ; CHECK-NEXT: ret float 4.200000e+01 +; CHECK-LABEL: @maxnum_float( +; CHECK-NEXT: ret float 4.200000e+01 +; %1 = call float @llvm.maxnum.f32(float 5.0, float 42.0) ret float %1 } ; Check that maxnum constant folds to propagate non-NaN or greater argument -; CHECK: define <4 x float> 
@maxnum_float_vec() { + define <4 x float> @maxnum_float_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.maxnum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @maxnum_float_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } ; Check that maxnum constant folds to propagate one of its argument zeros -; CHECK: define <4 x float> @maxnum_float_zeros_vec() { + define <4 x float> @maxnum_float_zeros_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.maxnum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @maxnum_float_zeros_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } -; CHECK: define float @minimum_float() { define float @minimum_float() { - ; CHECK-NEXT: ret float 5.000000e+00 +; CHECK-LABEL: @minimum_float( +; CHECK-NEXT: ret float 5.000000e+00 +; %1 = call float @llvm.minimum.f32(float 5.0, float 42.0) ret float %1 } ; Check that minimum propagates its NaN or smaller argument -; CHECK: define <4 x float> @minimum_float_vec() { + define <4 x float> @minimum_float_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.minimum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @minimum_float_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.minimum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } ; Check that minimum treats -0.0 as smaller than 0.0 while constant folding -; CHECK: define <4 x float> @minimum_float_zeros_vec() { + define <4 x float> @minimum_float_zeros_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.minimum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @minimum_float_zeros_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.minimum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } -; CHECK: define float @maximum_float() { define float @maximum_float() { - ; CHECK-NEXT: ret float 4.200000e+01 +; CHECK-LABEL: @maximum_float( +; CHECK-NEXT: ret float 4.200000e+01 +; %1 = call float @llvm.maximum.f32(float 5.0, float 42.0) ret float %1 } ; Check that maximum propagates its NaN or greater argument -; CHECK: define <4 x float> @maximum_float_vec() { + define <4 x float> @maximum_float_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.maximum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @maximum_float_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.maximum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } ; Check that maximum treats -0.0 as smaller than 0.0 while constant folding -; CHECK: define <4 x float> @maximum_float_zeros_vec() { + define <4 x float> @maximum_float_zeros_vec() { - ; CHECK-NEXT: ret <4 x float> - %1 = call <4 x float> @llvm.maximum.v4f32( - <4 x float> , - <4 x float> - ) +; CHECK-LABEL: @maximum_float_zeros_vec( +; CHECK-NEXT: ret <4 x float> +; + %1 = call <4 x float> @llvm.maximum.v4f32(<4 x float> , <4 x float> ) ret <4 x float> %1 } + +define i8 @smax() { +; CHECK-LABEL: @smax( +; CHECK-NEXT: ret i8 -127 +; + %r = call i8 @llvm.smax.i8(i8 128, i8 129) + ret i8 %r +} + +define <5 x i8> @smax_vec() { +; CHECK-LABEL: @smax_vec( +; CHECK-NEXT: ret <5 x i8> +; + %r = call <5 x i8> @llvm.smax.v5i8(<5 x i8> , <5 x i8> ) + ret <5 x i8> %r +} + +define i8 @smin() { +; CHECK-LABEL: @smin( +; CHECK-NEXT: ret i8 -128 +; + %r = call i8 @llvm.smin.i8(i8 128, i8 127) + ret i8 %r +} + +define <5 x 
i8> @smin_vec() { +; CHECK-LABEL: @smin_vec( +; CHECK-NEXT: ret <5 x i8> +; + %r = call <5 x i8> @llvm.smin.v5i8(<5 x i8> , <5 x i8> ) + ret <5 x i8> %r +} + +define i8 @umax() { +; CHECK-LABEL: @umax( +; CHECK-NEXT: ret i8 -128 +; + %r = call i8 @llvm.umax.i8(i8 128, i8 127) + ret i8 %r +} + +define <5 x i8> @umax_vec() { +; CHECK-LABEL: @umax_vec( +; CHECK-NEXT: ret <5 x i8> +; + %r = call <5 x i8> @llvm.umax.v5i8(<5 x i8> , <5 x i8> ) + ret <5 x i8> %r +} + +define i8 @umin() { +; CHECK-LABEL: @umin( +; CHECK-NEXT: ret i8 127 +; + %r = call i8 @llvm.umin.i8(i8 128, i8 127) + ret i8 %r +} + +define <5 x i8> @umin_vec() { +; CHECK-LABEL: @umin_vec( +; CHECK-NEXT: ret <5 x i8> +; + %r = call <5 x i8> @llvm.umin.v5i8(<5 x i8> , <5 x i8> ) + ret <5 x i8> %r +} diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll index 491e0900e08a2..54029289661c0 100644 --- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll +++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll @@ -1852,38 +1852,38 @@ define i32 @maskedload_extends() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %v16864u = zext 
<16 x i8> %loadv16i8 to <16 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for 
instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_extends' @@ -1922,15 +1922,15 @@ define i32 @maskedload_extends() { ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32> -; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> +; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-V8M-MAIN-RECIP-NEXT: Cost 
Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64> ; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64> @@ -1974,15 +1974,15 @@ define i32 @maskedload_extends() { ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32> -; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> +; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64> ; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64> @@ -2060,38 +2060,38 @@ define i32 @maskedload_extends() { ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32> +; CHECK-MVE-SIZE-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_extends' @@ -2361,7 +2361,7 @@ define i32 @maskedstore_trunc() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8832 = trunc <8 x i32> undef to <8 x i8> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8864 = trunc <8 x i64> undef to <8 x i8> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16816 = trunc <16 x i16> undef to <16 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16864 = trunc <16 x i64> undef to <16 x i8> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v21632 = trunc <2 x i32> undef to <2 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v21664 = trunc <2 x i64> undef to <2 x i16> @@ -2774,9 +2774,9 @@ define i32 @maskedload_fpextends() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef) ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef) ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> +; CHECK-MVE-RECIP-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double> @@ -2866,9 +2866,9 @@ define i32 @maskedload_fpextends() { ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef) ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef) ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double> @@ -2998,9 +2998,9 @@ define i32 @maskedload_fptrunc() { ; CHECK-MVE-RECIP-LABEL: 'maskedload_fptrunc' ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 
= fptrunc <8 x double> undef to <8 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float> @@ -3074,9 +3074,9 @@ define i32 @maskedload_fptrunc() { ; CHECK-MVE-SIZE-LABEL: 'maskedload_fptrunc' ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float> diff --git a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll index 3b1bc3b3fdbc0..e5fbf070cf32a 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/unal-vec-ldst.ll @@ -218,42 +218,6 @@ entry: ; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 } -define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { -entry: - %r = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %r - -; CHECK-LABEL: test_l_qv4float -; CHECK: cost of 2 for instruction: %r = load <4 x float>, <4 x float>* %p, align 4 -} - -define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { -entry: - %r = load <8 x float>, <8 x float>* %p, align 4 - ret <8 x float> %r - -; CHECK-LABEL: test_l_qv8float -; CHECK: cost of 4 for instruction: %r = load <8 x float>, <8 x float>* %p, align 4 -} - -define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { -entry: - %r = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %r - -; CHECK-LABEL: test_l_qv4double -; CHECK: cost of 2 for instruction: %r = load <4 x double>, <4 x double>* %p, align 8 -} - -define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { -entry: - %r = load <8 x double>, <8 x double>* %p, align 8 - ret <8 x double> %r - -; CHECK-LABEL: test_l_qv8double -; CHECK: cost of 4 for instruction: %r = load <8 x double>, <8 x double>* %p, align 8 -} - define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { entry: store <16 x i8> %v, <16 x i8>* %p, align 1 @@ -362,43 +326,6 @@ entry: ; CHECK: cost of 2 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 } -define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { -entry: - store <4 x float> %v, <4 x float>* %p, 
align 4 - ret void - -; CHECK-LABEL: test_s_qv4float -; CHECK: cost of 7 for instruction: store <4 x float> %v, <4 x float>* %p, align 4 -} - -define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { -entry: - store <8 x float> %v, <8 x float>* %p, align 4 - ret void - -; CHECK-LABEL: test_s_qv8float -; CHECK: cost of 15 for instruction: store <8 x float> %v, <8 x float>* %p, align 4 -} - -define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv4double -; CHECK: cost of 7 for instruction: store <4 x double> %v, <4 x double>* %p, align 8 -} - -define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { -entry: - store <8 x double> %v, <8 x double>* %p, align 8 - ret void - -; CHECK-LABEL: test_s_qv8double -; CHECK: cost of 15 for instruction: store <8 x double> %v, <8 x double>* %p, align 8 -} - attributes #0 = { nounwind "target-cpu"="pwr7" } -attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = { nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/Analysis/CostModel/X86/abs.ll b/llvm/test/Analysis/CostModel/X86/abs.ll new file mode 100644 index 0000000000000..20b8febc579ae --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/abs.ll @@ -0,0 +1,493 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW + +; +; abs(X, not_poison) +; + +define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE2-LABEL: 'cost_abs_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64>
%a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 0) + %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 0) + %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 0) + %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 0) + ret void +} + +define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE2-LABEL: 'cost_abs_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 0) + %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 0) + %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 0) + %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 0) + ret void +} + +define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE2-LABEL: 'cost_abs_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i16' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'cost_abs_i16' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'cost_abs_i16' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'cost_abs_i16' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 0) + %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 0) + %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 0) + %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 0) + ret void +} + +define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE2-LABEL: 'cost_abs_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'cost_abs_i8' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'cost_abs_i8' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) 
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'cost_abs_i8' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 0) + %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 0) + %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 0) + %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 0) + ret void +} + +; +; abs(X, poison) +; + +define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { +; SSE2-LABEL: 'cost_abs_i64_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i64_poison' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i64_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i64_poison' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1
true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i64_poison' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i64_poison' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 -1) + %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 -1) + %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 -1) + %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 -1) + ret void +} + +define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { +; SSE2-LABEL: 'cost_abs_i32_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i32_poison' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i32_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE42-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i32_poison' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i32_poison' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512-LABEL: 'cost_abs_i32_poison' +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 -1) + %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 -1) + %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 -1) + %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 -1) + ret void +} + +define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { +; SSE2-LABEL: 'cost_abs_i16_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i16_poison' +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i16_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i16_poison' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i16_poison' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'cost_abs_i16_poison' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'cost_abs_i16_poison' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'cost_abs_i16_poison' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 -1) + %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 -1) + %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 -1) + %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 -1) + ret void +} + +define void @cost_abs_i8_poison(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { +; SSE2-LABEL: 'cost_abs_i8_poison' +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'cost_abs_i8_poison' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSE42-LABEL: 'cost_abs_i8_poison' +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX1-LABEL: 'cost_abs_i8_poison' +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX2-LABEL: 'cost_abs_i8_poison' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512F-LABEL: 'cost_abs_i8_poison' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512DQ-LABEL: 'cost_abs_i8_poison' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512BW-LABEL: 'cost_abs_i8_poison' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 -1) + %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 -1) + %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 -1) + %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 -1) + ret void +} + +declare i64 @llvm.abs.i64(i64, i1) +declare i32 @llvm.abs.i32(i32, i1) 
+declare i16 @llvm.abs.i16(i16, i1) +declare i8 @llvm.abs.i8(i8, i1) + +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) + +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) + +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) diff --git a/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll b/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll new file mode 100644 index 0000000000000..7270a0eced4af --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/arith-sminmax.ll @@ -0,0 +1,401 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW + +declare i64 @llvm.smax.i64(i64, i64) +declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.smax.i32(i32, i32) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.smax.i16(i16, i16) +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.smax.i8(i8, i8) +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>) + +define i32 @smax(i32 %arg) { +; SSE2-LABEL: 'smax' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call 
<8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'smax' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x 
i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'smax' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'smax' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> 
undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'smax' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX2-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'smax' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'smax' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'smax' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> 
@llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +declare i64 @llvm.smin.i64(i64, i64) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.smin.i32(i32, i32) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.smin.i16(i16, i16) +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.smin.i8(i8, i8) +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>) + +define i32 @smin(i32 %arg) { +; SSE2-LABEL: 'smin' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'smin' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'smin' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'smin' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'smin' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = 
call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'smin' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'smin' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = 
call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'smin' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} diff --git a/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll b/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll new file mode 100644 index 0000000000000..8b6f6f20e1b40 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/arith-uminmax.ll @@ -0,0 +1,401 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512dq | 
FileCheck %s -check-prefixes=CHECK,AVX512,AVX512DQ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW + +declare i64 @llvm.umax.i64(i64, i64) +declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.umax.i32(i32, i32) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.umax.i16(i16, i16) +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.umax.i8(i8, i8) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>) + +define i32 @umax(i32 %arg) { +; SSE2-LABEL: 'umax' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'umax' +; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'umax' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'umax' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 undef +; +; AVX2-LABEL: 'umax' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'umax' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> 
@llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'umax' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'umax' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) + + 
%I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +declare i64 @llvm.umin.i64(i64, i64) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>) + +declare i32 @llvm.umin.i32(i32, i32) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) + +declare i16 @llvm.umin.i16(i16, i16) +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>) + +declare i8 @llvm.umin.i8(i8, i8) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>) + +define i32 @umin(i32 %arg) { +; SSE2-LABEL: 'umin' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: ret i32 undef +; +; SSSE3-LABEL: 'umin' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE42-LABEL: 'umin' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x 
i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX1-LABEL: 'umin' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> 
@llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'umin' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512F-LABEL: 'umin' +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> 
undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512DQ-LABEL: 'umin' +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512BW-LABEL: 'umin' +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) + %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) + %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) + %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, 
<16 x i16> undef) + %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) + %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} diff --git a/llvm/test/Analysis/DependenceAnalysis/AA.ll b/llvm/test/Analysis/DependenceAnalysis/AA.ll index ac04378b8a85d..efb5c8d1ef031 100644 --- a/llvm/test/Analysis/DependenceAnalysis/AA.ll +++ b/llvm/test/Analysis/DependenceAnalysis/AA.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -disable-output "-passes=print<da>" \ -; RUN: "-aa-pipeline=basic-aa,type-based-aa" 2>&1 | FileCheck %s +; RUN: "-aa-pipeline=basic-aa,tbaa" 2>&1 | FileCheck %s ; RUN: opt < %s -analyze -basic-aa -tbaa -da | FileCheck %s ; CHECK-LABEL: 'Dependence Analysis' for function 'test_no_noalias' diff --git a/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll b/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll index 476cf3208c84c..8fbf47304e800 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/nullptr.ll @@ -2,9 +2,9 @@ ; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s ; Test that the loop accesses are proven safe in this case. -; The analyzer uses to be confused by the "diamond" because GetUnderlyingObjects +; The analyzer used to be confused by the "diamond" because getUnderlyingObjects ; is saying that the two pointers can both point to null. The loop analyzer -; needs to ignore null in the results returned by GetUnderlyingObjects. +; needs to ignore null in the results returned by getUnderlyingObjects.
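+; A rough sketch of the shape in question (hypothetical IR, not taken from
+; this test): the "diamond" merges a real object with null on each path,
+;   %p = phi i16* [ %base, %if.then ], [ null, %if.else ]
+;   %q = phi i16* [ null, %if.then ], [ %base, %if.else ]
+; so getUnderlyingObjects reports { %base, null } for both pointers. If null
+; counted as a real underlying object, %p and %q would appear to overlap;
+; dropping null restores the expected result below.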
; CHECK: Memory dependences are safe with run-time checks diff --git a/llvm/test/Analysis/MemorySSA/phi-translation.ll b/llvm/test/Analysis/MemorySSA/phi-translation.ll index c39ccd31d24b3..93ebc86210d1f 100644 --- a/llvm/test/Analysis/MemorySSA/phi-translation.ll +++ b/llvm/test/Analysis/MemorySSA/phi-translation.ll @@ -287,3 +287,85 @@ the.end: ; preds = %for.main.body ret void } + + +@c = local_unnamed_addr global [2 x i16] zeroinitializer, align 2 + +define i32 @dont_merge_noalias_simple(i32* noalias %ptr) { +; CHECK-LABEL: define i32 @dont_merge_noalias_simple +; CHECK-LABEL: entry: +; CHECK: ; 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: store i16 1, i16* %s1.ptr, align 2 + +; CHECK-LABEL: %for.body +; CHECK: ; MemoryUse(4) MayAlias +; CHECK-NEXT: %lv = load i16, i16* %arrayidx, align 2 + +entry: + %s1.ptr = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 0 + store i16 1, i16* %s1.ptr, align 2 + br label %for.body + +for.body: ; preds = %for.body, %entry + %storemerge2 = phi i32 [ 1, %entry ], [ %dec, %for.body ] + %idxprom1 = zext i32 %storemerge2 to i64 + %arrayidx = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 %idxprom1 + %lv = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %lv to i32 + store i32 %conv, i32* %ptr, align 4 + %dec = add nsw i32 %storemerge2, -1 + %cmp = icmp sgt i32 %storemerge2, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %s2.ptr = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 0 + store i16 0, i16* %s2.ptr, align 2 + ret i32 0 +} + + +define i32 @dont_merge_noalias_complex(i32* noalias %ptr, i32* noalias %another) { +; CHECK-LABEL: define i32 @dont_merge_noalias_complex +; CHECK-LABEL: entry: +; CHECK: ; 1 = MemoryDef(liveOnEntry) +; CHECK-NEXT: store i16 1, i16* %s1.ptr, align 2 + +; CHECK-LABEL: %for.body +; CHECK: ; MemoryUse(7) MayAlias +; CHECK-NEXT: %lv = load i16, i16* %arrayidx, align 2 + +entry: + %s1.ptr = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 0 + store i16 1, i16* %s1.ptr, align 2 + br label %for.body + +for.body: ; preds = %for.body, %entry + %storemerge2 = phi i32 [ 1, %entry ], [ %dec, %merge.body ] + %idxprom1 = zext i32 %storemerge2 to i64 + %arrayidx = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 %idxprom1 + %lv = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %lv to i32 + store i32 %conv, i32* %ptr, align 4 + %dec = add nsw i32 %storemerge2, -1 + + %cmpif = icmp sgt i32 %storemerge2, 1 + br i1 %cmpif, label %if.body, label %else.body + +if.body: + store i32 %conv, i32* %another, align 4 + br label %merge.body + +else.body: + store i32 %conv, i32* %another, align 4 + br label %merge.body + +merge.body: + %cmp = icmp sgt i32 %storemerge2, 0 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %s2.ptr = getelementptr inbounds [2 x i16], [2 x i16]* @c, i64 0, i64 0 + store i16 0, i16* %s2.ptr, align 2 + ret i32 0 +} + diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll index 16a321a8108b2..717d92471406a 100644 --- a/llvm/test/Analysis/MemorySSA/pr39197.ll +++ b/llvm/test/Analysis/MemorySSA/pr39197.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -enable-mssa-loop-dependency -verify-memoryssa -sroa -globalopt -functionattrs -simplifycfg -licm -loop-unswitch %s -S | FileCheck %s +; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -enable-mssa-loop-dependency -verify-memoryssa -sroa -globalopt -function-attrs -simplifycfg -licm -loop-unswitch %s -S | 
FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr40509.ll b/llvm/test/Analysis/MemorySSA/pr40509.ll index 4827acf1b802b..1dbb6cfba3f61 100644 --- a/llvm/test/Analysis/MemorySSA/pr40509.ll +++ b/llvm/test/Analysis/MemorySSA/pr40509.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=systemz-unknown -march=z13 -O3 -enable-mssa-loop-dependency -disable-output %s +; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -O3 -enable-mssa-loop-dependency -disable-output %s ; During transform to LCSSA, an access becomes obfuscated to: ; (2 = phi (phi(val), val)), which BasicAA fails to analyze. diff --git a/llvm/test/Analysis/ScalarEvolution/smax.ll b/llvm/test/Analysis/ScalarEvolution/smax.ll deleted file mode 100644 index 2b2c81c8e90df..0000000000000 --- a/llvm/test/Analysis/ScalarEvolution/smax.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution | FileCheck %s -; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s -; PR1614 - -; CHECK: --> (%a smax %b) -; CHECK: --> (%a smax %b smax %c) -; CHECK-NOT: smax - -define i32 @x(i32 %a, i32 %b, i32 %c) { - %A = icmp sgt i32 %a, %b - %B = select i1 %A, i32 %a, i32 %b - %C = icmp sle i32 %c, %B - %D = select i1 %C, i32 %B, i32 %c - ret i32 %D -} diff --git a/llvm/test/Analysis/ScalarEvolution/smin-smax-folds.ll b/llvm/test/Analysis/ScalarEvolution/smin-smax-folds.ll new file mode 100644 index 0000000000000..15ec72317d9ab --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/smin-smax-folds.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -analyze -enable-new-pm=0 -scalar-evolution < %s | FileCheck %s +; RUN: opt -disable-output "-passes=print<scalar-evolution>" < %s 2>&1 | FileCheck %s + +; Tests for smin & smax folds. + +; Test case from PR1614. +define i32 @test_PR1614(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: 'test_PR1614' +; CHECK: --> (%a smax %b) +; CHECK: --> (%a smax %b smax %c) +; CHECK-NOT: smax + + %A = icmp sgt i32 %a, %b + %B = select i1 %A, i32 %a, i32 %b + %C = icmp sle i32 %c, %B + %D = select i1 %C, i32 %B, i32 %c + ret i32 %D +} + +declare void @iteration() + +; Test case from PR46939. +; The information from the loop guard can be used to simplify the trip count expression.
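+; (Illustrative reasoning, not generated output: the loop below counts %i.011
+; down from %n while (%i.011 > 0), so without extra facts the backedge-taken
+; count would be expected to involve (0 smax %n). The dominating guard
+; (icmp sgt i32 %n, -1) establishes %n >= 0, so the smax folds away and the
+; count is plain %n, as the CHECK lines verify.)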
+define void @smin_simplify_with_guard(i32 %n) { +; CHECK-LABEL: 'smin_simplify_with_guard' +; CHECK-NEXT: Classifying expressions for: @smin_simplify_with_guard +; CHECK-NEXT: %i.011 = phi i32 [ %n, %for.body.lr.ph ], [ %dec, %for.body ] +; CHECK-NEXT: --> {%n,+,-1}<%for.body> U: full-set S: full-set Exits: 0 LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: %dec = add nsw i32 %i.011, -1 +; CHECK-NEXT: --> {(-1 + %n),+,-1}<%for.body> U: full-set S: full-set Exits: -1 LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: Determining loop execution counts for: @smin_simplify_with_guard +; CHECK-NEXT: Loop %for.body: backedge-taken count is %n +; CHECK-NEXT: Loop %for.body: max backedge-taken count is 2147483647 +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is %n +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; +entry: + %cmp10 = icmp sgt i32 %n, -1 + br i1 %cmp10, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: + br label %for.body + +for.body: + %i.011 = phi i32 [ %n, %for.body.lr.ph ], [ %dec, %for.body ] + call void @iteration() + %dec = add nsw i32 %i.011, -1 + %cmp = icmp sgt i32 %i.011, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @smin_to_smax(i32 %n) { +; FIXME: ((-1 * (0 smin %n)) + %n) is actually just (0 smax %n) + +; CHECK-LABEL: 'smin_to_smax' +; CHECK-NEXT: Classifying expressions for: @smin_to_smax +; CHECK-NEXT: %i.011 = phi i32 [ %n, %for.body.lr.ph ], [ %dec, %for.body ] +; CHECK-NEXT: --> {%n,+,-1}<%for.body> U: full-set S: full-set Exits: (0 smin %n) LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: %dec = add nsw i32 %i.011, -1 +; CHECK-NEXT: --> {(-1 + %n),+,-1}<%for.body> U: full-set S: full-set Exits: (-1 + (0 smin %n)) LoopDispositions: { %for.body: Computable } +; CHECK-NEXT: Determining loop execution counts for: @smin_to_smax +; CHECK-NEXT: Loop %for.body: backedge-taken count is ((-1 * (0 smin %n)) + %n) +; CHECK-NEXT: Loop %for.body: max backedge-taken count is 2147483647 +; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-1 * (0 smin %n)) + %n) +; CHECK-NEXT: Predicates: +; CHECK: Loop %for.body: Trip multiple is 1 +; +entry: + br label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.body: + %i.011 = phi i32 [ %n, %for.body.lr.ph ], [ %dec, %for.body ] + call void @iteration() + %dec = add nsw i32 %i.011, -1 + %cmp = icmp sgt i32 %i.011, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; The information from the loop guard can be used to simplify the trip count expression. 
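+; (Illustrative reasoning: here the guard (icmp sge i32 %n, %start) dominates
+; a loop counting %k.0.i26 up from %start while it stays below %n, so the
+; backedge-taken count ((-1 * %start) + (%start smax %n)) could in principle
+; fold to (%n - %start). The FIXME above follows from the same algebra:
+; x + (-1 * smin(0, x)) = x - min(0, x) = max(0, x) = (0 smax x).)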
+define void @smax_simplify_with_guard(i32 %start, i32 %n) { +; CHECK-LABEL: 'smax_simplify_with_guard' +; CHECK-NEXT: Classifying expressions for: @smax_simplify_with_guard +; CHECK-NEXT: %k.0.i26 = phi i32 [ %start, %loop.ph ], [ %inc.i, %loop ] +; CHECK-NEXT: --> {%start,+,1}<%loop> U: full-set S: full-set Exits: (%start smax %n) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %inc.i = add nsw i32 %k.0.i26, 1 +; CHECK-NEXT: --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: (1 + (%start smax %n)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @smax_simplify_with_guard +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-1 * %start) + (%start smax %n)) +; CHECK-NEXT: Loop %loop: max backedge-taken count is -1 +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-1 * %start) + (%start smax %n)) +; CHECK-NEXT: Predicates: +; CHECK: Loop %loop: Trip multiple is 1 +entry: + %guard = icmp sge i32 %n, %start + br i1 %guard, label %loop.ph, label %exit + +loop.ph: + br label %loop + +loop: + %k.0.i26 = phi i32 [ %start, %loop.ph ], [ %inc.i, %loop ] + %inc.i = add nsw i32 %k.0.i26, 1 + %cmp26.not.i.not = icmp slt i32 %k.0.i26, %n + br i1 %cmp26.not.i.not, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll b/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll new file mode 100644 index 0000000000000..3a897e957d1b2 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/trivial-phis.ll @@ -0,0 +1,191 @@ +; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s + +; CHECK-LABEL @test1 +; CHECK %add.lcssa.wide = phi i64 [ %indvars.iv.next, %do.body ] +; CHECK-NEXT --> %add.lcssa.wide U: [1,2147483648) S: [1,2147483648) + +define i64 @test1(i32 signext %n, float* %A) { +entry: + %0 = sext i32 %n to i64 + br label %do.body + +do.body: ; preds = %do.body, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 1.000000e+00, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + %add.lcssa.wide = phi i64 [ %indvars.iv.next, %do.body ] + ret i64 %add.lcssa.wide +} + +; CHECK-LABEL @test2 +; CHECK: %tmp24 = phi i64 [ %tmp14, %bb22 ], [ %tmp14, %bb13 ] +; CHECK-NEXT: --> %tmp24 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb13: Variant, %bb8: Variant, %bb17: Invariant, %bb27: Invariant } + +define void @test2(i64 %arg, i32* noalias %arg1) { +bb: + %tmp = icmp slt i64 0, %arg + br i1 %tmp, label %bb7, label %bb48 + +bb7: ; preds = %bb + br label %bb8 + +bb8: ; preds = %bb44, %bb7 + %tmp9 = phi i64 [ 0, %bb7 ], [ %tmp45, %bb44 ] + %tmp10 = add nsw i64 %arg, -1 + %tmp11 = icmp slt i64 1, %tmp10 + br i1 %tmp11, label %bb12, label %bb43 + +bb12: ; preds = %bb8 + br label %bb13 + +bb13: ; preds = %bb39, %bb12 + %tmp14 = phi i64 [ 1, %bb12 ], [ %tmp40, %bb39 ] + %tmp15 = icmp slt i64 0, %arg + br i1 %tmp15, label %bb16, label %bb23 + +bb16: ; preds = %bb13 + br label %bb17 + +bb17: ; preds = %bb19, %bb16 + %tmp18 = phi i64 [ 0, %bb16 ], [ %tmp20, %bb19 ] + br label %bb19 + +bb19: ; preds = %bb17 + %tmp20 = add nuw nsw i64 %tmp18, 1 + %tmp21 = icmp slt i64 %tmp20, %arg + br i1 %tmp21, label %bb17, label %bb22 + +bb22: ; preds = %bb19 + br label %bb23 + +bb23: ; preds = %bb22, %bb13 + %tmp24 = phi i64 [ %tmp14, %bb22 ], [ 
%tmp14, %bb13 ] + %tmp25 = icmp slt i64 0, %arg + br i1 %tmp25, label %bb26, label %bb37 + +bb26: ; preds = %bb23 + br label %bb27 + +bb27: ; preds = %bb33, %bb26 + %tmp28 = phi i64 [ 0, %bb26 ], [ %tmp34, %bb33 ] + %tmp29 = mul nsw i64 %tmp9, %arg + %tmp30 = getelementptr inbounds i32, i32* %arg1, i64 %tmp24 + %tmp31 = getelementptr inbounds i32, i32* %tmp30, i64 %tmp29 + %tmp32 = load i32, i32* %tmp31, align 4 + br label %bb33 + +bb33: ; preds = %bb27 + %tmp34 = add nuw nsw i64 %tmp28, 1 + %tmp35 = icmp slt i64 %tmp34, %arg + br i1 %tmp35, label %bb27, label %bb36 + +bb36: ; preds = %bb33 + br label %bb37 + +bb37: ; preds = %bb36, %bb23 + %tmp38 = phi i64 [ %tmp24, %bb36 ], [ %tmp24, %bb23 ] + br label %bb39 + +bb39: ; preds = %bb37 + %tmp40 = add nuw nsw i64 %tmp38, 1 + %tmp41 = icmp slt i64 %tmp40, %tmp10 + br i1 %tmp41, label %bb13, label %bb42 + +bb42: ; preds = %bb39 + br label %bb43 + +bb43: ; preds = %bb42, %bb8 + br label %bb44 + +bb44: ; preds = %bb43 + %tmp45 = add nuw nsw i64 %tmp9, 1 + %tmp46 = icmp slt i64 %tmp45, %arg + br i1 %tmp46, label %bb8, label %bb47 + +bb47: ; preds = %bb44 + br label %bb48 + +bb48: ; preds = %bb47, %bb + ret void +} + +; CHECK-LABEL @test3 + +; CHECK: %tmp14 = phi i64 [ %tmp40, %bb39 ], [ 1, %bb8 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } +; CHECK: %tmp18 = phi i64 [ %tmp20, %bb17 ], [ 0, %bb13 ] +; CHECK-NEXT: --> {0,+,1}<%bb17_a> U: [0,9223372036854775807) S: [0,9223372036854775807) +; CHECK-SAME: Exits: (-1 + %arg) LoopDispositions: { %bb17_a: Computable, %bb13: Variant, %bb8: Variant } + +; CHECK: %tmp24 = phi i64 [ %tmp14, %bb13 ], [ %tmp14, %bb17 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } +; CHECK: %tmp28 = phi i64 [ %tmp34, %bb27 ], [ 0, %bb23 ] +; CHECK-NEXT: --> {0,+,1}<%bb27> U: [0,9223372036854775807) S: [0,9223372036854775807) +; CHECK-SAME: Exits: (-1 + %arg) LoopDispositions: { %bb27: Computable, %bb13: Variant, %bb8: Variant } + +; CHECK: %tmp38 = phi i64 [ %tmp24, %bb23 ], [ %tmp24, %bb27 ] +; CHECK-NEXT: --> {1,+,1}<%bb13> U: [1,9223372036854775807) S: [1,9223372036854775807) +; CHECK-SAME: Exits: (-2 + %arg) LoopDispositions: { %bb13: Computable, %bb8: Variant, %bb17_a: Invariant, %bb27: Invariant } + +define void @test3(i64 %arg, i32* %arg1) { +bb: + %tmp = icmp slt i64 0, %arg + br i1 %tmp, label %bb8, label %bb48 + +bb8: ; preds = %bb, %bb44 + %tmp9 = phi i64 [ %tmp45, %bb44 ], [ 0, %bb ] + %tmp10 = add nsw i64 %arg, -1 + %tmp11 = icmp slt i64 1, %tmp10 + br i1 %tmp11, label %bb13, label %bb44 + +bb13: ; preds = %bb8, %bb39 + %tmp14 = phi i64 [ %tmp40, %bb39 ], [ 1, %bb8 ] + %tmp15 = icmp slt i64 0, %arg + br i1 %tmp15, label %bb17_a, label %bb23 + +bb17_a: + %tmp18 = phi i64 [ %tmp20, %bb17 ], [ 0, %bb13 ] + %tmp20 = add nuw nsw i64 %tmp18, 1 + + br label %bb17 + +bb17: ; preds = %bb13, %bb17 + %tmp21 = icmp slt i64 %tmp20, %arg + br i1 %tmp21, label %bb17_a, label %bb23 + +bb23: ; preds = %bb17, %bb13 + %tmp24 = phi i64 [ %tmp14, %bb13 ], [ %tmp14, %bb17 ] + %tmp25 = icmp slt i64 0, %arg + br i1 %tmp25, label %bb27, label %bb39 + +bb27: ; preds = %bb23, %bb27 + %tmp28 = phi i64 [ %tmp34, %bb27 ], [ 0, %bb23 ] + %tmp29 = mul nsw i64 %tmp9, %arg + %tmp30 = getelementptr inbounds i32, 
i32* %arg1, i64 %tmp24 + %tmp31 = getelementptr inbounds i32, i32* %tmp30, i64 %tmp29 + %tmp32 = load i32, i32* %tmp31, align 4 + %tmp34 = add nuw nsw i64 %tmp28, 1 + %tmp35 = icmp slt i64 %tmp34, %arg + br i1 %tmp35, label %bb27, label %bb39 + +bb39: ; preds = %bb23, %bb27 + %tmp38 = phi i64 [ %tmp24, %bb23 ], [ %tmp24, %bb27 ] + %tmp40 = add nuw nsw i64 %tmp38, 1 + %tmp41 = icmp slt i64 %tmp40, %tmp10 + br i1 %tmp41, label %bb13, label %bb44 + +bb44: ; preds = %bb8, %bb39 + %tmp45 = add nuw nsw i64 %tmp9, 1 + %tmp46 = icmp slt i64 %tmp45, %arg + br i1 %tmp46, label %bb8, label %bb48 + +bb48: ; preds = %bb44, %bb + ret void +} diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll index 5c66f03b87020..099a7aeb8a76a 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic-domains.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic.ll index 9ca06a6826bdb..92cc0ccc32d3c 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll b/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll index e0c16b1991157..8275cc0439e00 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/basic2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -basic-aa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -basic-aa -scoped-noalias-aa -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll index ba893fedee269..fca330a1029f3 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -tbaa -basic-aa -functionattrs -S | FileCheck %s +; RUN: opt < %s -tbaa -basic-aa -function-attrs -S | FileCheck %s ; 
FunctionAttrs should make use of TBAA. diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll index 16cb3b741307f..f25024db4c013 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -tbaa -basic-aa -aa-eval -evaluate-aa-metadata -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s -; RUN: opt < %s -aa-pipeline=type-based-aa,basic-aa -passes=aa-eval -evaluate-aa-metadata -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -aa-pipeline=tbaa,basic-aa -passes=aa-eval -evaluate-aa-metadata -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s ; Generated with "clang -cc1 -disable-llvm-optzns -O1 -emit-llvm" ; #include <new> diff --git a/llvm/test/Analysis/alias-analysis-uses.ll b/llvm/test/Analysis/alias-analysis-uses.ll index 4163ec25584c6..8f13148b20117 100644 --- a/llvm/test/Analysis/alias-analysis-uses.ll +++ b/llvm/test/Analysis/alias-analysis-uses.ll @@ -1,4 +1,4 @@ -; RUN: opt -debug-pass=Executions -globals-aa -functionattrs -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -debug-pass=Executions -globals-aa -function-attrs -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Executing Pass 'Globals Alias Analysis' ; CHECK-NOT: Freeing Pass 'Globals Alias Analysis' diff --git a/llvm/test/Bindings/OCaml/ipo.ml b/llvm/test/Bindings/OCaml/ipo.ml index 6a67f37570edb..c3630ece7bd48 100644 --- a/llvm/test/Bindings/OCaml/ipo.ml +++ b/llvm/test/Bindings/OCaml/ipo.ml @@ -55,7 +55,6 @@ let test_transforms () = ++ add_always_inliner ++ add_global_dce ++ add_global_optimizer - ++ add_ipc_propagation ++ add_prune_eh ++ add_ipsccp ++ add_internalize ~all_but_main:true diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 8ac3e50ea0b74..f555856f34e58 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -244,6 +244,14 @@ define i8 @scalable_call(i8* %addr) #1 { ret i8 %res } +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction{{.*}}scalable_alloca +; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_alloca +define void @scalable_alloca() #1 { + %local0 = alloca <vscale x 16 x i8> + load volatile <vscale x 16 x i8>, <vscale x 16 x i8>* %local0 + ret void +} + ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction{{.*}}asm_indirect_output ; FALLBACK-WITH-REPORT-OUT-LABEL: asm_indirect_output define void @asm_indirect_output() { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 8975b2d0bc9ae..11ffacae7b866 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1391,6 +1391,16 @@ define float @test_intrinsic_round(float %a) { ret float %res } +declare i32 @llvm.lrint.i32.f32(float) +define i32 @test_intrinsic_lrint(float %a) { +; CHECK-LABEL: name: test_intrinsic_lrint +; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 +; CHECK: [[RES:%[0-9]+]]:_(s32) = G_INTRINSIC_LRINT [[A]] +; CHECK: $w0 = COPY [[RES]] + %res = call i32 @llvm.lrint.i32.f32(float %a) + ret i32 %res +} + declare i32 @llvm.ctlz.i32(i32, i1) define i32 @test_ctlz_intrinsic_zero_not_undef(i32 %a) { ; CHECK-LABEL: name: test_ctlz_intrinsic_zero_not_undef diff --git 
a/llvm/test/CodeGen/AArch64/GlobalISel/combine-inttoptr-ptrtoint.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-inttoptr-ptrtoint.mir new file mode 100644 index 0000000000000..d99b38900383b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-inttoptr-ptrtoint.mir @@ -0,0 +1,33 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +--- +name: test_combine_inttoptr_same_addressspace +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_combine_inttoptr_same_addressspace + ; CHECK: [[COPY:%[0-9]+]]:_(p64) = COPY $x0 + ; CHECK: $x1 = COPY [[COPY]](p64) + %0:_(p64) = COPY $x0 + %1:_(s64) = G_PTRTOINT %0 + %2:_(p64) = G_INTTOPTR %1 + $x1 = COPY %2 +... +--- +name: test_combine_inttoptr_diff_addressspace +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_combine_inttoptr_diff_addressspace + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0) + ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p64) = G_INTTOPTR [[PTRTOINT]](s64) + ; CHECK: $x1 = COPY [[INTTOPTR]](p64) + %0:_(p0) = COPY $x0 + %1:_(s64) = G_PTRTOINT %0 + %2:_(p64) = G_INTTOPTR %1 + $x1 = COPY %2 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptrtoint.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptrtoint.mir new file mode 100644 index 0000000000000..8164b1bf872fc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-ptrtoint.mir @@ -0,0 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +--- +name: test_combine_ptrtoint +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_combine_ptrtoint + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x1 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(p64) = G_INTTOPTR %0 + %2:_(s64) = G_PTRTOINT %1 + $x1 = COPY %2 +... 
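+# (Illustrative note, a sketch rather than generated checks: the
+# G_INTTOPTR/G_PTRTOINT round trip folds to a plain copy here because the
+# s64 integer matches the p64 pointer width; combine-inttoptr-ptrtoint.mir
+# above keeps the inverse pair when the address spaces differ, p0 vs p64.)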
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll new file mode 100644 index 0000000000000..065a3d8e4dd25 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-convert-fp16-intrinsics.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=aarch64-- -mcpu=falkor -mattr=+lse -O0 -aarch64-enable-atomic-cfg-tidy=0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - | FileCheck %s + +define i16 @convert_to_fp16(float %src) { + ; CHECK-LABEL: name: convert_to_fp16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $s0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY]](s32) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %src) + ret i16 %cvt +} + +define float @convert_from_fp16(i16 %src) { + ; CHECK-LABEL: name: convert_from_fp16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; CHECK: $s0 = COPY [[FPEXT]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %src) + ret float %cvt +} + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll new file mode 100644 index 0000000000000..c70b93e50ded8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-fixed-point-intrinsics.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -stop-after=irtranslator -mtriple=aarch64-- -verify-machineinstrs -o - %s | FileCheck %s + +define i16 @smul_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: smul_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SMULFIX:%[0-9]+]]:_(s16) = G_SMULFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMULFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.smul.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @umul_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: umul_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UMULFIX:%[0-9]+]]:_(s16) = G_UMULFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.umul.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @smul_fix_sat(i16 %arg0, i16 %arg1) { + ; 
CHECK-LABEL: name: smul_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SMULFIXSAT:%[0-9]+]]:_(s16) = G_SMULFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMULFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.smul.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @umul_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: umul_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UMULFIXSAT:%[0-9]+]]:_(s16) = G_UMULFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.umul.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @sdiv_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: sdiv_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SDIVFIX:%[0-9]+]]:_(s16) = G_SDIVFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SDIVFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.sdiv.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @udiv_fix(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: udiv_fix + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UDIVFIX:%[0-9]+]]:_(s16) = G_UDIVFIX [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UDIVFIX]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.udiv.fix.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @sdiv_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: sdiv_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[SDIVFIXSAT:%[0-9]+]]:_(s16) = G_SDIVFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SDIVFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.sdiv.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +define i16 @udiv_fix_sat(i16 %arg0, i16 %arg1) { + ; CHECK-LABEL: name: udiv_fix_sat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0, $w1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[UDIVFIXSAT:%[0-9]+]]:_(s16) = G_UDIVFIXSAT [[TRUNC]], [[TRUNC1]], 7 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UDIVFIXSAT]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %res = call i16 @llvm.udiv.fix.sat.i16(i16 %arg0, i16 %arg1, i32 7) + ret i16 %res +} + +declare i16 @llvm.smul.fix.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.umul.fix.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.smul.fix.sat.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.umul.fix.sat.i16(i16, i16, i32 immarg) #0 +declare i16 @llvm.sdiv.fix.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.udiv.fix.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.sdiv.fix.sat.i16(i16, i16, i32 immarg) #1 +declare i16 @llvm.udiv.fix.sat.i16(i16, i16, i32 immarg) #1 + +attributes #0 = { nounwind readnone speculatable willreturn } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-no-op-intrinsics.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-no-op-intrinsics.ll new file mode 100644 index 0000000000000..dc243b74375d1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-no-op-intrinsics.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -O0 -mtriple=aarch64-- -stop-after=irtranslator -verify-machineinstrs -o - %s | FileCheck %s + +define i64 @expect_i64(i64 %arg0) { + ; CHECK-LABEL: name: expect_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; CHECK: $x0 = COPY [[COPY1]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %expval = call i64 @llvm.expect.i64(i64 %arg0, i64 1) + ret i64 %expval +} + +define i8* @ptr_annotate(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3) { + ; CHECK-LABEL: name: ptr_annotate + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w3, $x0, $x1, $x2 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3 + ; CHECK: [[COPY4:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK: $x0 = COPY [[COPY4]](p0) + ; CHECK: RET_ReallyLR implicit $x0 + %call = call i8* @llvm.ptr.annotation.p0i8(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3) + ret i8* %call +} + +@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata" +@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata" + +define i32 @annotation(i32 %a) { + ; CHECK-LABEL: name: annotation + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: $w0 = COPY [[COPY1]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %call = call i32 @llvm.annotation.i32(i32 %a, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str1, i32 0, i32 0), i32 2) + ret i32 %call +} + +define i8* @launder_invariant_group(i8* %p) { + ; CHECK-LABEL: name: launder_invariant_group + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK: $x0 = COPY [[COPY1]](p0) + ; CHECK: RET_ReallyLR implicit $x0 + %q = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) 
+ ret i8* %q +} + +define i8* @strip_invariant_group(i8* %p) { + ; CHECK-LABEL: name: strip_invariant_group + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK: $x0 = COPY [[COPY1]](p0) + ; CHECK: RET_ReallyLR implicit $x0 + %q = call i8* @llvm.strip.invariant.group.p0i8(i8* %p) + ret i8* %q +} + +declare i64 @llvm.expect.i64(i64, i64) #0 +declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32) #1 +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 +declare i8* @llvm.launder.invariant.group.p0i8(i8*) #2 +declare i8* @llvm.strip.invariant.group.p0i8(i8*) #3 + +attributes #0 = { nounwind readnone willreturn } +attributes #1 = { nounwind willreturn } +attributes #2 = { inaccessiblememonly nounwind speculatable willreturn } +attributes #3 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir index f96e2e65ce8f4..9417df066a46b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir @@ -39,9 +39,8 @@ body: | ; CHECK-LABEL: name: test_freeze_v4s64 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY [[DEF]](<2 x s64>) ; CHECK: [[FREEZE:%[0-9]+]]:_(<2 x s64>) = G_FREEZE [[DEF]] - ; CHECK: [[FREEZE1:%[0-9]+]]:_(<2 x s64>) = G_FREEZE [[COPY]] + ; CHECK: [[FREEZE1:%[0-9]+]]:_(<2 x s64>) = G_FREEZE [[DEF]] ; CHECK: $q0 = COPY [[FREEZE]](<2 x s64>) ; CHECK: $q1 = COPY [[FREEZE1]](<2 x s64>) %undef:_(<4 x s64>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir index e0a20e030710e..2af37dff5fd49 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir @@ -24,6 +24,9 @@ define void @test_sitofp_v4s32() { ret void } define void @test_uitofp_v4s32() { ret void } + define void @test_uitofp_v2s64_v2i1() { ret void } + define void @test_sitofp_v2s64_v2i1() { ret void } + define void @test_sitofp_s32_s16() { ret void } define void @test_uitofp_s32_s16() { ret void } ... @@ -239,6 +242,38 @@ body: | $q0 = COPY %1 ... +--- +name: test_uitofp_v2s64_v2i1 +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: test_uitofp_v2s64_v2i1 + ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[DEF]](s1), [[DEF]](s1) + ; CHECK: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT [[BUILD_VECTOR]](<2 x s1>) + ; CHECK: [[UITOFP:%[0-9]+]]:_(<2 x s64>) = G_UITOFP [[ZEXT]](<2 x s64>) + ; CHECK: $q0 = COPY [[UITOFP]](<2 x s64>) + %0:_(<2 x s1>) = G_IMPLICIT_DEF + %1:_(<2 x s64>) = G_UITOFP %0 + $q0 = COPY %1 +... + +--- +name: test_sitofp_v2s64_v2i1 +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: test_sitofp_v2s64_v2i1 + ; CHECK: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[DEF]](s1), [[DEF]](s1) + ; CHECK: [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR]](<2 x s1>) + ; CHECK: [[SITOFP:%[0-9]+]]:_(<2 x s64>) = G_SITOFP [[SEXT]](<2 x s64>) + ; CHECK: $q0 = COPY [[SITOFP]](<2 x s64>) + %0:_(<2 x s1>) = G_IMPLICIT_DEF + %1:_(<2 x s64>) = G_SITOFP %0 + $q0 = COPY %1 +... 
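+# (Illustrative note: <2 x s1> is not a source type the fp conversions can
+# take directly, so the legalizer widens it first, using G_ZEXT for G_UITOFP
+# and G_SEXT for G_SITOFP so the i1 payload is extended as 0/1 or 0/-1
+# before the conversion runs on <2 x s64>, as the CHECK lines above show.)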
+ --- name: test_sitofp_s32_s16 body: | diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-lrint.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-lrint.mir new file mode 100644 index 0000000000000..ddc4f07e1a146 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-lrint.mir @@ -0,0 +1,94 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=legalizer %s -o - | FileCheck %s +--- +name: testmsws +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$s0' } +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: testmsws + ; CHECK: liveins: $s0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK: [[INTRINSIC_LRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LRINT [[COPY]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LRINT]](s64) + ; CHECK: $w0 = COPY [[TRUNC]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $s0 + %1:_(s64) = G_INTRINSIC_LRINT %0(s32) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: testmsxs +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$s0' } +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: testmsxs + ; CHECK: liveins: $s0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK: [[INTRINSIC_LRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LRINT [[COPY]](s32) + ; CHECK: $x0 = COPY [[INTRINSIC_LRINT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $s0 + %1:_(s64) = G_INTRINSIC_LRINT %0(s32) + $x0 = COPY %1(s64) + RET_ReallyLR implicit $x0 + +... +--- +name: testmswd +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: testmswd + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK: [[INTRINSIC_LRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LRINT [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LRINT]](s64) + ; CHECK: $w0 = COPY [[TRUNC]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $d0 + %1:_(s64) = G_INTRINSIC_LRINT %0(s64) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: testmsxd +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: testmsxd + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK: [[INTRINSIC_LRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LRINT [[COPY]](s64) + ; CHECK: $x0 = COPY [[INTRINSIC_LRINT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $d0 + %1:_(s64) = G_INTRINSIC_LRINT %0(s64) + $x0 = COPY %1(s64) + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir index 7b4ae3d56ab0f..22d9406ac4f6f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir @@ -43,6 +43,50 @@ body: | $q0 = COPY %2(<2 x s64>) RET_ReallyLR implicit $q0 +... 
+--- +name: shuffle_v16i8 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v16i8 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s8>) = COPY $q1 + ; CHECK: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[COPY]](<16 x s8>), [[COPY1]], shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + ; CHECK: $q0 = COPY [[SHUF]](<16 x s8>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<16 x s8>) = COPY $q0 + %1:_(<16 x s8>) = COPY $q1 + %2:_(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %1, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %2(<16 x s8>) + RET_ReallyLR implicit $q0 + +... +--- +name: shuffle_v8i16 +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: shuffle_v8i16 + ; CHECK: liveins: $q0, $q1 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1 + ; CHECK: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<8 x s16>), [[COPY1]], shufflemask(0, 0, 0, 0, 0, 0, 0, 0) + ; CHECK: $q0 = COPY [[SHUF]](<8 x s16>) + ; CHECK: RET_ReallyLR implicit $q0 + %0:_(<8 x s16>) = COPY $q0 + %1:_(<8 x s16>) = COPY $q1 + %2:_(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %1, shufflemask(0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %2(<8 x s16>) + RET_ReallyLR implicit $q0 + ... --- name: shuffle_1elt_mask diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir index 2e7c1cbb33914..228e3a44b0c50 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir @@ -54,9 +54,8 @@ body: | ; CHECK-LABEL: name: test_implicit_def_v4s64 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY [[DEF]](<2 x s64>) ; CHECK: $q0 = COPY [[DEF]](<2 x s64>) - ; CHECK: $q1 = COPY [[COPY]](<2 x s64>) + ; CHECK: $q1 = COPY [[DEF]](<2 x s64>) %0:_(<4 x s64>) = G_IMPLICIT_DEF %1:_(<2 x s64> ), %2:_(<2 x s64>) = G_UNMERGE_VALUES %0 $q0 = COPY %1 @@ -77,3 +76,25 @@ body: | $w0 = COPY %1 $w1 = COPY %2 ... +--- +name: test_implicit_def_v16s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s8>) = G_IMPLICIT_DEF + ; CHECK: $q0 = COPY [[DEF]](<16 x s8>) + %0:_(<16 x s8>) = G_IMPLICIT_DEF + $q0 = COPY %0 +... +--- +name: test_implicit_def_v8s16 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v8s16 + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF + ; CHECK: $q0 = COPY [[DEF]](<8 x s16>) + %0:_(<8 x s16>) = G_IMPLICIT_DEF + $q0 = COPY %0 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index dcd310fba7e9e..110c57408c056 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -129,6 +129,13 @@ # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined @@ -337,6 +344,30 @@ # DEBUG-NEXT: G_SSUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -407,12 +438,12 @@ # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_SITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_UITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. the first uncovered type index: 2, OK -# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FABS (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir new file mode 100644 index 0000000000000..5743c489fa206 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-to-duplane.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +... +--- +name: duplane_v16i8 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: duplane_v16i8 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[DUPv16i8lane:%[0-9]+]]:fpr128 = DUPv16i8lane [[COPY]], 0 + ; CHECK: $q0 = COPY [[DUPv16i8lane]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<16 x s8>) = COPY $q0 + %2:fpr(<16 x s8>) = G_IMPLICIT_DEF + %1:fpr(<16 x s8>) = G_SHUFFLE_VECTOR %0(<16 x s8>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %1(<16 x s8>) + RET_ReallyLR implicit $q0 + +... +--- +name: duplane_v8i16 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: duplane_v8i16 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[DUPv8i16lane:%[0-9]+]]:fpr128 = DUPv8i16lane [[COPY]], 0 + ; CHECK: $q0 = COPY [[DUPv8i16lane]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<8 x s16>) = COPY $q0 + %2:fpr(<8 x s16>) = G_IMPLICIT_DEF + %1:fpr(<8 x s16>) = G_SHUFFLE_VECTOR %0(<8 x s16>), %2, shufflemask(0, 0, 0, 0, 0, 0, 0, 0) + $q0 = COPY %1(<8 x s16>) + RET_ReallyLR implicit $q0 + +... +--- +name: duplane_v4f32 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: duplane_v4f32 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[DUPv4i32lane:%[0-9]+]]:fpr128 = DUPv4i32lane [[COPY]], 0 + ; CHECK: $q0 = COPY [[DUPv4i32lane]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<4 x s32>) = COPY $q0 + %2:fpr(<4 x s32>) = G_IMPLICIT_DEF + %1:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %2, shufflemask(0, 0, 0, 0) + $q0 = COPY %1(<4 x s32>) + RET_ReallyLR implicit $q0 + +... +--- +name: duplane_v2i64 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: duplane_v2i64 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[DUPv2i64lane:%[0-9]+]]:fpr128 = DUPv2i64lane [[COPY]], 0 + ; CHECK: $q0 = COPY [[DUPv2i64lane]] + ; CHECK: RET_ReallyLR implicit $q0 + %0:fpr(<2 x s64>) = COPY $q0 + %2:fpr(<2 x s64>) = G_IMPLICIT_DEF + %1:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(0, 0) + $q0 = COPY %1(<2 x s64>) + RET_ReallyLR implicit $q0 + +... 
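Every shuffle in select-shuffle-to-duplane.mir above uses a mask whose defined elements all name lane 0 of the first operand, which is what lets the selector emit a single DUP-lane instruction rather than a table lookup. A minimal sketch of the splat-mask test such a selection relies on (illustrative only; the in-tree selector code is organized differently):

  #include <optional>
  #include <vector>

  // Returns the lane being splatted, or nullopt if the mask mixes
  // lanes. Negative elements are treated as undef and match any lane.
  std::optional<int> getSplatLane(const std::vector<int> &Mask) {
    int Lane = -1;
    for (int Elt : Mask) {
      if (Elt < 0)
        continue;            // undef element
      if (Lane == -1)
        Lane = Elt;          // first defined element fixes the lane
      else if (Elt != Lane)
        return std::nullopt; // two different lanes: not a splat
    }
    if (Lane < 0)
      return std::nullopt;   // all-undef mask
    return Lane;
  }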
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir index c66d334d16b0b..345cc774f9963 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir @@ -140,7 +140,7 @@ body: | ; CHECK-LABEL: name: shuffle_v2i64 ; CHECK: constants: - ; CHECK: value: '<16 x i8> ' + ; CHECK: value: '<16 x i8> ' ; CHECK: alignment: 16 ; CHECK: isTargetSpecific: false ; CHECK: liveins: $q0, $q1 @@ -154,7 +154,7 @@ body: | ; CHECK: RET_ReallyLR implicit $q0 %0:fpr(<2 x s64>) = COPY $q0 %1:fpr(<2 x s64>) = COPY $q1 - %2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(0, 0) + %2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, shufflemask(1, 0) $q0 = COPY %2(<2 x s64>) RET_ReallyLR implicit $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir index b72b8e5d0d700..db355dfc151f5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir @@ -11,6 +11,9 @@ define void @store_zero_s64_gpr(i64* %addr) { ret void } define void @store_zero_s32_gpr(i32* %addr) { ret void } + define void @store_zero_s16(i32* %addr) { ret void } + define void @store_zero_s8(i32* %addr) { ret void } + define void @store_zero_look_through_cst(i32* %addr) { ret void } define void @store_fi_s64_gpr() { %ptr0 = alloca i64 @@ -176,6 +179,53 @@ body: | ... +--- +name: store_zero_s16 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_zero_s16 + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK: STRHHui $wzr, [[COPY]], 0 :: (store 2) + %0:gpr(p0) = COPY $x0 + %1:gpr(s16) = G_CONSTANT i16 0 + G_STORE %1(s16), %0(p0) :: (store 2) + +... + +--- +name: store_zero_s8 +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_zero_s8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK: STRBBui $wzr, [[COPY]], 0 :: (store 1) + %0:gpr(p0) = COPY $x0 + %1:gpr(s8) = G_CONSTANT i8 0 + G_STORE %1(s8), %0(p0) :: (store 1) +... + +--- +name: store_zero_look_through_cst +legalized: true +regBankSelected: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_zero_look_through_cst + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK: STRXui $xzr, [[COPY]], 0 :: (store 8 into %ir.addr) + %0:gpr(p0) = COPY $x0 + %1:gpr(s32) = G_CONSTANT i32 0 + %2:gpr(s64) = G_ZEXT %1 + G_STORE %2, %0 :: (store 8 into %ir.addr) +... + --- name: store_fi_s64_gpr legalized: true diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/xro-addressing-mode-constant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/xro-addressing-mode-constant.mir new file mode 100644 index 0000000000000..7b2dae2f7e505 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/xro-addressing-mode-constant.mir @@ -0,0 +1,211 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# +# Test using the xro addressing mode with immediates. This should be done for +# wide constants which are preferably selected using a mov rather than an add. + +... 
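The comments inside the xro tests below classify each constant with bit masks. A constant that fits an ADD/SUB immediate (12 bits, optionally LSL #12) is cheapest to fold directly into an add, while a constant that occupies a single 16-bit MOVZ/MOVK chunk is cheap to materialize in a register and then fold through the register-offset (XRO) addressing mode. A hedged sketch of those two checks, with the masks taken from the test comments and the helper names invented here for illustration:

  #include <cstdint>

  // Too wide for "add Xd, Xn, #imm12 {, lsl #12}": a bit is set above
  // the 24 bits those immediates can reach (mask from the comments).
  bool cannotEncodeAddLsl(uint64_t C) {
    return (C & 0x000fffffff000000ULL) != 0;
  }

  // Fits one 16-bit aligned chunk, so a single MOVZ materializes it.
  bool fitsInOneMovChunk(uint64_t C) {
    for (int Shift = 0; Shift < 64; Shift += 16)
      if ((C & ~(0xffffULL << Shift)) == 0)
        return true;
    return false;
  }

For example, 4580179968 (0x111000000) cannot be encoded as an add immediate, so it is materialized with MOVi64imm and folded via LDRXroX; 61440 (0xf000) fits one chunk and also takes the XRO form after a single mov; 1118208 (0x111000) is exactly an imm12 shifted by 12 and is folded into an ADDXri instead.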
+--- +name: use_xro_cannot_encode_add_lsl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Check that we use the XRO addressing mode when the constant cannot be + ; represented using an add + lsl. + ; + ; cst = 0000000111000000 + ; cst & 000fffffff000000 != 0 + ; + ; CHECK-LABEL: name: use_xro_cannot_encode_add_lsl + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %cst:gpr64 = MOVi64imm 4580179968 + ; CHECK: %load:gpr64 = LDRXroX %copy, %cst, 0, 0 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 4580179968 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: use_xro_preferred_mov +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Check that we use the XRO addressing mode when the constant can be + ; represented using a single movk. + ; + ; cst = 000000000000f000 + ; cst & 000fffffff000000 == 0 + ; cst & ffffffffffff0fff != 0 + ; + ; CHECK-LABEL: name: use_xro_preferred_mov + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 61440 + ; CHECK: %cst:gpr64 = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 + ; CHECK: %load:gpr64 = LDRXroX %copy, %cst, 0, 0 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 61440 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: use_xro_negative_imm +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Check that this works even if we have a negative immediate. + ; + ; CHECK-LABEL: name: use_xro_negative_imm + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %cst:gpr64 = MOVi64imm -61440 + ; CHECK: %load:gpr64 = LDRXroX %copy, %cst, 0, 0 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 -61440 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: dont_use_xro_selectable_imm +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Immediates that can be encoded in a LDRXui should be skipped. + ; + ; CHECK-LABEL: name: dont_use_xro_selectable_imm + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %load:gpr64 = LDRXui %copy, 2 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 16 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: dont_use_xro_selectable_negative_imm +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Immediates that can be encoded in a LDRXui should be skipped. + ; + ; CHECK-LABEL: name: dont_use_xro_selectable_negative_imm + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %load:gpr64 = LDURXi %copy, -16 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 -16 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... 
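By contrast, the dont_use_xro tests use offsets that the plain immediate load forms already absorb, so burning a register on the constant would be a loss. The architectural immediate ranges for a 64-bit load are (helper names again ours):

  #include <cstdint>

  // LDRXui: unsigned 12-bit offset, scaled by the 8-byte access size.
  bool fitsLdrXui(int64_t Off) {
    return Off >= 0 && Off <= 4095 * 8 && Off % 8 == 0;
  }

  // LDURXi: signed 9-bit unscaled offset.
  bool fitsLdurXi(int64_t Off) {
    return Off >= -256 && Off <= 255;
  }

Offsets 16 and 0 take the scaled LDRXui form, while -16 and 17 (not a multiple of 8) fall back to the unscaled LDURXi form.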
+--- +name: dont_use_xro_zero +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Immediates that can be encoded in a LDRXui should be skipped. + ; + ; CHECK-LABEL: name: dont_use_xro_zero + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %load:gpr64 = LDRXui %copy, 0 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 0 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: dont_use_xro_in_range +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Check that we skip constants which can be encoded in an add. + ; 17 is in [0x0, 0xfff] + ; + ; CHECK-LABEL: name: dont_use_xro_in_range + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64sp = COPY $x0 + ; CHECK: %load:gpr64 = LDURXi %copy, 17 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 17 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR + +... +--- +name: dont_use_xro_add_lsl +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; Check that we skip when we have an add with an lsl which cannot be + ; represented as a movk. + ; + ; cst = 0x0000000000111000 + ; cst & 000fffffff000000 = 0 + ; cst & ffffffffff00ffff != 0 + ; cst & ffffffffffff0fff != 0 + ; + ; CHECK-LABEL: name: dont_use_xro_add_lsl + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64 = COPY $x0 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY %copy + ; CHECK: %addr:gpr64sp = ADDXri [[COPY]], 273, 12 + ; CHECK: %load:gpr64 = LDRXui %addr, 0 :: (volatile load 8) + ; CHECK: RET_ReallyLR + %copy:gpr(p0) = COPY $x0 + %cst:gpr(s64) = G_CONSTANT i64 1118208 + %addr:gpr(p0) = G_PTR_ADD %copy, %cst(s64) + %load:gpr(s64) = G_LOAD %addr(p0) :: (volatile load 8) + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll index 5f75b4ef944bb..bfe7e4941da8b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -147,3 +147,50 @@ entry: ret i64 %v } +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_i64_with_ld:%bb.0 +; CHECK:Cluster ld/st SU(5) - SU(10) +; CHECK:Cluster ld/st SU(15) - SU(20) +; CHECK:SU(5): STRXui %7:gpr64, %0:gpr64common, 0 :: +; CHECK:SU(10): STRXui %12:gpr64, %0:gpr64common, 1 :: +; CHECK:SU(15): STRXui %17:gpr64, %0:gpr64common, 2 :: +; CHECK:SU(20): STRXui %22:gpr64, %0:gpr64common, 3 :: +define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 8 + %0 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16 + %1 = load i64, i64* %arrayidx3, align 8 + %mul = mul nsw i64 %1, %0 + %2 = load i64, i64* %a, align 8 + %add6 = add nsw i64 %2, %mul + store i64 %add6, i64* %a, align 8 + %arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9 + %3 = load i64, i64* %arrayidx.1, align 8 + %arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17 + %4 = load i64, i64* %arrayidx3.1, align 8 + %mul.1 = mul nsw i64 %4, %3 + %arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1 + %5 = load i64, i64* %arrayidx5.1, align 8 + 
%add6.1 = add nsw i64 %5, %mul.1 + store i64 %add6.1, i64* %arrayidx5.1, align 8 + %arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10 + %6 = load i64, i64* %arrayidx.2, align 8 + %arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18 + %7 = load i64, i64* %arrayidx3.2, align 8 + %mul.2 = mul nsw i64 %7, %6 + %arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2 + %8 = load i64, i64* %arrayidx5.2, align 8 + %add6.2 = add nsw i64 %8, %mul.2 + store i64 %add6.2, i64* %arrayidx5.2, align 8 + %arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11 + %9 = load i64, i64* %arrayidx.3, align 8 + %arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19 + %10 = load i64, i64* %arrayidx3.3, align 8 + %mul.3 = mul nsw i64 %10, %9 + %arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3 + %11 = load i64, i64* %arrayidx5.3, align 8 + %add6.3 = add nsw i64 %11, %mul.3 + store i64 %add6.3, i64* %arrayidx5.3, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll index 0a6c7a14a3352..5f76f0a1c2714 100644 --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -426,11 +426,10 @@ define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind { ; CHECK-NEXT: rev32.16b v0, v0 ; CHECK-NEXT: ret ; -; FALLBACK-LABEL: test_vrev32Q8: -; FALLBACK: // %bb.0: -; FALLBACK-NEXT: ldr q0, [x0] -; FALLBACK-NEXT: rev32.16b v0, v0 -; FALLBACK-NEXT: ret +; GISEL-LABEL: test_vrev32Q8: +; GISEL: // %bb.0: +; GISEL: tbl.16b v0, { v0, v1 }, v2 +; GISEL: ret %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -443,11 +442,10 @@ define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind { ; CHECK-NEXT: rev32.8h v0, v0 ; CHECK-NEXT: ret ; -; FALLBACK-LABEL: test_vrev32Q16: -; FALLBACK: // %bb.0: -; FALLBACK-NEXT: ldr q0, [x0] -; FALLBACK-NEXT: rev32.8h v0, v0 -; FALLBACK-NEXT: ret +; GISEL-LABEL: test_vrev32Q16: +; GISEL: // %bb.0: +; GISEL: tbl.16b v0, { v0, v1 }, v2 +; GISEL: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %tmp2 @@ -477,11 +475,10 @@ define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind { ; CHECK-NEXT: rev16.16b v0, v0 ; CHECK-NEXT: ret ; -; FALLBACK-LABEL: test_vrev16Q8: -; FALLBACK: // %bb.0: -; FALLBACK-NEXT: ldr q0, [x0] -; FALLBACK-NEXT: rev16.16b v0, v0 -; FALLBACK-NEXT: ret +; GISEL-LABEL: test_vrev16Q8: +; GISEL: // %bb.0: +; GISEL: tbl.16b v0, { v0, v1 }, v2 +; GISEL: ret %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp2 @@ -513,11 +510,10 @@ define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind { ; CHECK-NEXT: rev32.8h v0, v0 ; CHECK-NEXT: ret ; -; FALLBACK-LABEL: test_vrev32Q16_undef: -; FALLBACK: // %bb.0: -; FALLBACK-NEXT: ldr q0, [x0] -; FALLBACK-NEXT: rev32.8h v0, v0 -; FALLBACK-NEXT: ret +; GISEL-LABEL: test_vrev32Q16_undef: +; GISEL: // %bb.0: +; GISEL: tbl.16b v0, { v0, v1 }, v2 +; GISEL: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %tmp2 diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll index d236aeaf32a70..9ab7247677070 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll @@ -30,9 +30,19 @@ define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> 
@fcvtas_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtas_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtas d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtau_2s: @@ -61,9 +71,19 @@ define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtau_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtau_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtau d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtms_2s: @@ -92,9 +112,19 @@ define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtms_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtms_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtms d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtmu_2s: @@ -123,9 +153,19 @@ define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtmu_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtmu_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtmu d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtps_2s: @@ -154,9 +194,19 @@ define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtps_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtps_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtps d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> 
@llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtpu_2s: @@ -185,9 +235,19 @@ define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtpu_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtpu_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtpu d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtns_2s: @@ -216,9 +276,19 @@ define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtns_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtns_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtns d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtnu_2s: @@ -247,9 +317,19 @@ define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +define <1 x i64> @fcvtnu_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtnu_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtnu d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtzs_2s: @@ -278,6 +358,57 @@ define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +; FIXME: Generate "fcvtzs d0, d0"? 
+define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtzs_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtzs x8, d0 +;CHECK-NEXT: mov d0, x8 +;CHECK-NEXT: ret + %tmp3 = fptosi <1 x double> %A to <1 x i64> + ret <1 x i64> %tmp3 +} + +define <2 x i32> @fcvtzs_2s_intrinsic(<2 x float> %A) nounwind { +;CHECK-LABEL: fcvtzs_2s_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzs.2s v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %A) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @fcvtzs_4s_intrinsic(<4 x float> %A) nounwind { +;CHECK-LABEL: fcvtzs_4s_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzs.4s v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %A) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @fcvtzs_2d_intrinsic(<2 x double> %A) nounwind { +;CHECK-LABEL: fcvtzs_2d_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzs.2d v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %A) + ret <2 x i64> %tmp3 +} + +define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtzs_1d_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzs d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + +declare <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: fcvtzu_2s: @@ -306,6 +437,58 @@ define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind { ret <2 x i64> %tmp3 } +; FIXME: Generate "fcvtzu d0, d0"? 
+define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtzu_1d: +;CHECK-NOT: ld1 +;CHECK: fcvtzu x8, d0 +;CHECK-NEXT: mov d0, x8 +;CHECK-NEXT: ret + %tmp3 = fptoui <1 x double> %A to <1 x i64> + ret <1 x i64> %tmp3 +} + +define <2 x i32> @fcvtzu_2s_intrinsic(<2 x float> %A) nounwind { +;CHECK-LABEL: fcvtzu_2s_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzu.2s v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %A) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @fcvtzu_4s_intrinsic(<4 x float> %A) nounwind { +;CHECK-LABEL: fcvtzu_4s_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzu.4s v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %A) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @fcvtzu_2d_intrinsic(<2 x double> %A) nounwind { +;CHECK-LABEL: fcvtzu_2d_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzu.2d v0, v0 +;CHECK-NEXT: ret + %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %A) + ret <2 x i64> %tmp3 +} + +define <1 x i64> @fcvtzu_1d_intrinsic(<1 x double> %A) nounwind { +;CHECK-LABEL: fcvtzu_1d_intrinsic: +;CHECK-NOT: ld1 +;CHECK: fcvtzu d0, d0 +;CHECK-NEXT: ret + %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A) + ret <1 x i64> %tmp3 +} + +declare <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float>) nounwind readnone +declare <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float>) nounwind readnone +declare <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>) nounwind readnone + define <2 x float> @frinta_2s(<2 x float> %A) nounwind { ;CHECK-LABEL: frinta_2s: ;CHECK-NOT: ld1 diff --git a/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir b/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir new file mode 100644 index 0000000000000..cfcda7de52c52 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cfinv-def-nzcv.mir @@ -0,0 +1,17 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+11]]:29: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } + - { reg: '$x0' } +body: | + bb.0: + liveins: $w0, $x0 + + CFINV implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir b/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir new file mode 100644 index 0000000000000..4e3b3ead77155 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cfinv-use-nzcv.mir @@ -0,0 +1,17 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+11]]:25: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } + - { reg: '$x0' } +body: | + bb.0: + liveins: $w0, $x0 + + CFINV implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir b/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir index e37d8be349480..4f144e1ef8bff 100644 --- a/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir +++ b/llvm/test/CodeGen/AArch64/falkor-hwpf-fix.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass aarch64-falkor-hwpf-fix-late -o - %s | FileCheck %s --- # Verify that the tag collision between the loads is resolved for various load opcodes. 
diff --git a/llvm/test/CodeGen/AArch64/falkor-hwpf.ll b/llvm/test/CodeGen/AArch64/falkor-hwpf.ll
index b9c7291313fb9..aa4a43f2430ba 100644
--- a/llvm/test/CodeGen/AArch64/falkor-hwpf.ll
+++ b/llvm/test/CodeGen/AArch64/falkor-hwpf.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
-; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF
+; RUN: opt < %s -S -aarch64-falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
+; RUN: opt < %s -S -aarch64-falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF
 
 ; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.
 
diff --git a/llvm/test/CodeGen/AArch64/fjcvtzs.mir b/llvm/test/CodeGen/AArch64/fjcvtzs.mir
new file mode 100644
index 0000000000000..efdee63669e29
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fjcvtzs.mir
@@ -0,0 +1,17 @@
+# RUN: not llc -o - %s -mtriple=arm64-eabi -mattr=+jsconv -run-pass=legalizer 2>&1 | FileCheck %s
+
+# CHECK: [[@LINE+11]]:49: missing implicit register operand 'implicit-def $nzcv'
+
+...
+---
+name: test_jcvt
+liveins:
+  - { reg: '$d0' }
+body: |
+  bb.0:
+    liveins: $d0
+
+    renamable $w0 = FJCVTZS killed renamable $d0
+    RET undef $lr, implicit killed $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
index c8333b253ec42..ff19e6ac91a73 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -16,6 +16,10 @@ declare i64 @llvm.aarch64.neon.fcvtau.i64.f16(half)
 declare i32 @llvm.aarch64.neon.fcvtau.i32.f16(half)
 declare i64 @llvm.aarch64.neon.fcvtas.i64.f16(half)
 declare i32 @llvm.aarch64.neon.fcvtas.i32.f16(half)
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half)
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half)
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half)
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half)
 declare half @llvm.aarch64.neon.frsqrte.f16(half)
 declare half @llvm.aarch64.neon.frecpx.f16(half)
 declare half @llvm.aarch64.neon.frecpe.f16(half)
@@ -138,6 +142,42 @@ entry:
   ret i64 %0
 }
 
+define i32 @fcvtzu_intrinsic_i32(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i32:
+; CHECK: fcvtzu w0, h0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a)
+  ret i32 %fcvt
+}
+
+define i64 @fcvtzu_intrinsic_i64(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i64:
+; CHECK: fcvtzu x0, h0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a)
+  ret i64 %fcvt
+}
+
+define i32 @fcvtzs_intrinsic_i32(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i32:
+; CHECK: fcvtzs w0, h0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a)
+  ret i32 %fcvt
+}
+
+define i64 @fcvtzs_intrinsic_i64(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i64:
+; CHECK: fcvtzs x0, h0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a)
+  ret i64 %fcvt
+}
+
 define dso_local i16 @t19(half %a) {
 ; CHECK-LABEL: t19:
 ; CHECK: fcvtas w0, h0
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 7903df64863bf..e6eb9e7a3d3ed 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -1,4 +1,8 @@
 # RUN: llc -mattr=+sve -mtriple=aarch64-none-linux-gnu
-run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -start-before=prologepilog %s -filetype=obj -o %t +# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO +# RUN: rm -rf %t # # Test allocation and deallocation of SVE objects on the stack, # as well as using a combination of scalable and non-scalable @@ -23,28 +27,28 @@ # --- | - define void @test_allocate_sve() nounwind { entry: unreachable } - define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable } - define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } - define void @test_address_sve() nounwind { entry: unreachable } - define void @test_address_sve_fp() nounwind { entry: unreachable } - define void @test_stack_arg_sve() nounwind { entry: unreachable } - define void @test_address_sve_out_of_range() nounwind { entry: unreachable } - define void @test_address_gpr_vla() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_pregs_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_zregs_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_sve_realign() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @frame_layout() nounwind { entry: unreachable } + define void @test_allocate_sve() { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() { entry: unreachable } + define void @test_address_sve() { entry: unreachable } + define void @test_address_sve_fp() { entry: unreachable } + define void @test_stack_arg_sve() { entry: unreachable } + define void @test_address_sve_out_of_range() { entry: unreachable } + define void @test_address_gpr_vla() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_pregs_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_zregs_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_sve_realign() { entry: unreachable } + define aarch64_sve_vector_pcs void @frame_layout() { entry: unreachable } ... # +----------+ # |scratchreg| // x29 is used as scratch reg. 
# +----------+ -# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, -# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve @@ -54,16 +58,23 @@ # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_allocate_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: RET_ReallyLR @@ -73,10 +84,9 @@ body: | # | x20, x21 | // callee saves # |scratchreg| // x29 is used as scratch reg. # +----------+ -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 | // scalable objects # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves @@ -87,6 +97,8 @@ body: | # CHECK-NEXT: frame-setup STPXi killed $x21, killed $x20, $sp, 2 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-4: frame-setup CFI_INSTRUCTION +# # CHECK-NEXT: $x20 = IMPLICIT_DEF # CHECK-NEXT: $x21 = IMPLICIT_DEF # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 @@ -94,11 +106,21 @@ body: | # CHECK-NEXT: $x21, $x20 = frame-destroy LDPXi $sp, 2 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 32 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_allocate_sve_gpr_callee_saves: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +# ASM-NEXT: .cfi_offset w20, -8 +# ASM-NEXT: .cfi_offset w21, -16 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 name: test_allocate_sve_gpr_callee_saves -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: $x20 = IMPLICIT_DEF @@ -109,11 +131,10 @@ body: | # +----------+ # | lr, fp | // frame record # +----------+ <- FP -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 
| // scalable objects # +----------+ # |//////////| // alignment gap -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_realigned # CHECK: stackSize: 32 @@ -124,14 +145,24 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_allocate_sve_gpr_realigned: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_allocate_sve_gpr_realigned -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 32 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 32 } body: | bb.0.entry: RET_ReallyLR @@ -144,7 +175,7 @@ body: | # | %stack.1 | // scalable @ SP + 16b + 16 scalable bytes # | %stack.2 | // scalable @ SP + 16b + 14 scalable bytes # +----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_address_sve @@ -154,6 +185,7 @@ body: | # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 2 @@ -166,22 +198,29 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_address_sve frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, %fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -190,11 +229,11 @@ body: | # | x20, x21 | // callee saves # | lr, fp | // frame record # +-----------+ <- FP -# | %fstack.0 | // scalable @ FP - 16 scalable bytes -# | %fstack.1 | // scalable @ FP - 32 scalable bytes -# | %fstack.2 | // scalable @ FP - 34 scalable bytes +# | %stack.0 | // scalable @ FP - 16 scalable bytes +# | 
%stack.1 | // scalable @ FP - 32 scalable bytes +# | %stack.2 | // scalable @ FP - 34 scalable bytes # +-----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +-----------+ <- SP # CHECK-LABEL: name: test_address_sve_fp @@ -205,6 +244,7 @@ body: | # CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION # CHECK-NEXT: STR_ZXI $z0, $fp, -1 # CHECK-NEXT: STR_ZXI $z1, $fp, -2 @@ -214,23 +254,31 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve_fp: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_address_sve_fp frameInfo: maxAlignment: 16 isFrameAddressTaken: true -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, %fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -240,9 +288,9 @@ body: | # +-----------+ # |callee save| // register saved as scratch reg. 
# +-----------+ -# | %fstack.1 | // vector of 16 scalable bytes +# | %stack.0 | // vector of 16 scalable bytes # +---------- + -# | %stack.0 | // not scalable, 16 bytes +# | %stack.1 | // not scalable, 16 bytes # +-----------+ <- SP # CHECK-LABEL: name: test_stack_arg_sve # CHECK: stackSize: 32 @@ -251,6 +299,7 @@ body: | # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1 # CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4 @@ -259,12 +308,20 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_stack_arg_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_stack_arg_sve fixedStack: - { id: 0, stack-id: default, size: 16, alignment: 16, offset: 0 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 16, alignment: 16 } body: | bb.0.entry: liveins: $x0 @@ -299,6 +356,7 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $[[TMP2:x[0-9]+]] = ADDVL_XXI $sp, 1 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP2]], 255 @@ -317,20 +375,27 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 9 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve_out_of_range: +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2056 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +2056, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_address_sve_out_of_range frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16, offset: -3600 } - - { id: 2, stack-id: sve-vec, size: 512, alignment: 16, offset: -4112 } +stack: + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16 } + - { id: 2, stack-id: sve-vec, size: 512, alignment: 16 } body: | bb.0.entry: liveins: $z0, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_PXI $p0, %fixed-stack.1, 0 + STR_ZXI $z0, %stack.0, 0 + STR_PXI $p0, %stack.1, 0 RET_ReallyLR --- @@ -340,30 +405,40 @@ body: | # access from the FP when there are also SVE objects on the stack. 
# # +----------+ <- FP -# | %fstack.0| // 16 scalable bytes +# | %stack.0 | // 16 scalable bytes # +----------+ <- @FP - 16 scalable bytes -# | %stack.0 | // 16 bytes +# | %stack.1 | // 16 bytes # +----------+ <- @BP -# : %stack.1 : // variable length +# : %stack.2 : // variable length # +----------+ <- SP # CHECK-LABEL: name: test_address_gpr_vla # CHECK: bb.0.entry: # CHECK: STRXui $xzr, $x19, 0 # CHECK: RET_ReallyLR +# +# ASM-LABEL: test_address_gpr_vla: +# ASM: .cfi_def_cfa w29, 32 +# ASM-NEXT: .cfi_offset w19, -16 +# ASM-NEXT: .cfi_offset w30, -24 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +32 +# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -24 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 name: test_address_gpr_vla frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } - - { id: 1, stack-id: default, type: variable-sized } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } + - { id: 2, stack-id: default, type: variable-sized } body: | bb.0.entry: liveins: $xzr - STRXui $xzr, %stack.0, 0 + STRXui $xzr, %stack.1, 0 RET_ReallyLR --- @@ -374,6 +449,7 @@ body: | # CHECK: frame-setup STR_PXI killed $p5, $sp, 6 # CHECK: frame-setup STR_PXI killed $p4, $sp, 7 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK: $p6 = frame-destroy LDR_PXI $sp, 5 @@ -381,6 +457,13 @@ body: | # CHECK: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK: RET_ReallyLR +# +# ASM-LABEL: save_restore_pregs_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 8 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_pregs_sve stack: - { id: 0, stack-id: default, size: 32, alignment: 16 } @@ -395,18 +478,34 @@ body: | --- ... 
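Reading one of these .cfi_escape directives byte by byte shows the pattern all the remaining ASM/UNWINDINFO checks follow. Taking the save_restore_pregs_sve escape above (0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22):

  0x0f            DW_CFA_def_cfa_expression
  0x0c            expression length: 12 bytes
  0x8f 0x00       DW_OP_breg31 + 0          ; the stack pointer
  0x11 0x30       DW_OP_consts 48
  0x22            DW_OP_plus                ; sp + 48
  0x11 0x08       DW_OP_consts 8
  0x92 0x2e 0x00  DW_OP_bregx reg 0x2e + 0  ; VG, the SVE vector granule
  0x1e            DW_OP_mul                 ; 8 * VG
  0x22            DW_OP_plus                ; sp + 48 + 8 * VG

which is exactly the "sp + 48 + 8 * VG" the ASM comment spells out and the UNWINDINFO lines check in decoded form.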
# CHECK-LABEL: name: save_restore_zregs_sve -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -3 -# CHECK: frame-setup STR_ZXI killed $z10, $sp, 0 -# CHECK: frame-setup STR_ZXI killed $z9, $sp, 1 -# CHECK: frame-setup STR_ZXI killed $z8, $sp, 2 -# CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK: $sp = frame-setup STRXpre killed $fp, $sp, -16 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 +# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 +# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 +# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-5: frame-setup CFI_INSTRUCTION + +# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 +# CHECK-NEXT: $sp, $fp = frame-destroy LDRXpost $sp, 16 +# CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: save_restore_zregs_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG + +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 -# CHECK: $z10 = frame-destroy LDR_ZXI $sp, 0 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 2 -# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 3 -# CHECK: RET_ReallyLR name: save_restore_zregs_sve stack: - { id: 0, stack-id: default, size: 32, alignment: 16 } @@ -429,7 +528,7 @@ body: | # CHECK-LABEL: name: save_restore_sve # CHECK: $sp = frame-setup STPXpre killed ${{[a-z0-9]+}}, killed $x21, $sp, -4 # CHECK: frame-setup STPXi killed $x20, killed $x19, $sp, 2 -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK: frame-setup STR_PXI killed $p15, $sp, 4 # CHECK: frame-setup STR_PXI killed $p14, $sp, 5 # CHECK: frame-setup STR_PXI killed $p5, $sp, 14 @@ -438,9 +537,12 @@ body: | # CHECK: frame-setup STR_ZXI killed $z22, $sp, 3 # CHECK: frame-setup STR_ZXI killed $z9, $sp, 16 # CHECK: frame-setup STR_ZXI killed $z8, $sp, 17 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-13: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 @@ -449,15 +551,44 @@ body: | # CHECK: $z22 = frame-destroy LDR_ZXI 
$sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 -# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 19 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18 # CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2 # CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4 # CHECK: RET_ReallyLR +# +# ASM-LABEL: save_restore_sve: +# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG +# ASM-NEXT: .cfi_offset w19, -8 +# ASM-NEXT: .cfi_offset w20, -16 +# ASM-NEXT: .cfi_offset w21, -24 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -40, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -24 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 + name: save_restore_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 32, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 32, alignment: 16 } body: | bb.0.entry: @@ -494,7 +625,7 @@ body: | # CHECK-LABEL: name: save_restore_sve_realign # CHECK: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 
# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK-NEXT: STR_PXI killed $p15, $sp, 4 # CHECK-NEXT: STR_PXI killed $p14, $sp, 5 # CHECK: STR_PXI killed $p5, $sp, 14 @@ -503,10 +634,12 @@ body: | # CHECK-NEXT: STR_ZXI killed $z22, $sp, 3 # CHECK: STR_ZXI killed $z9, $sp, 16 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 17 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-COUNT-11: frame-setup CFI_INSTRUCTION -# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -19 +# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 @@ -518,11 +651,35 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: save_restore_sve_realign: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -40, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_sve_realign -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: 
-16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 32 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 16, alignment: 32 } body: | bb.0.entry: @@ -580,7 +737,7 @@ body: | # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$z23', # CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2, # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p4', -# CHECK: - { id: 9, name: '', type: spill-slot, offset: -48, size: 2, alignment: 16, +# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2, # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p15', # CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, # CHECK-NEXT: stack-id: default, callee-saved-register: '$fp', @@ -593,6 +750,15 @@ body: | # CHECK-NEXT: STR_ZXI killed $z23, $sp, 1 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 2 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -7 +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION +# ASM-LABEL: frame_layout: +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 80 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +80, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: frame_layout stack: - { id: 0, type: default, size: 32, alignment: 16, stack-id: sve-vec } diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index 816465f9eaa17..9f3a77c8fe92b 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -182,7 +182,7 @@ define @urem_i64( %a, %b ; SMIN ; -define @smin_i8( %a, %b, %c) { +define @smin_i8( %a, %b) { ; CHECK-LABEL: smin_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -193,7 +193,7 @@ define @smin_i8( %a, %b, ret %min } -define @smin_i16( %a, %b, %c) { +define @smin_i16( %a, %b) { ; CHECK-LABEL: smin_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -204,7 +204,7 @@ define @smin_i16( %a, %b ret %min } -define @smin_i32( %a, %b, %c) { +define @smin_i32( %a, %b) { ; CHECK-LABEL: smin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -215,7 +215,7 @@ define @smin_i32( %a, %b ret %min } -define @smin_i64( %a, %b, %c) { +define @smin_i64( %a, %b) { ; CHECK-LABEL: smin_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -226,7 +226,7 @@ define @smin_i64( %a, %b ret %min } -define @smin_split_i8( %a, %b, %c) { +define @smin_split_i8( %a, %b) { ; CHECK-LABEL: smin_split_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -238,7 +238,7 @@ define @smin_split_i8( %a, %min } -define @smin_split_i16( %a, %b, %c) { +define @smin_split_i16( %a, %b) { ; CHECK-LABEL: smin_split_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -252,7 +252,7 @@ define @smin_split_i16( %a, %min } -define @smin_split_i32( %a, %b, %c) { +define @smin_split_i32( %a, %b) { ; CHECK-LABEL: smin_split_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -264,7 +264,7 @@ define @smin_split_i32( %a, %min } -define @smin_split_i64( %a, %b, %c) { +define @smin_split_i64( %a, %b) { ; CHECK-LABEL: 
smin_split_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -276,7 +276,7 @@ define @smin_split_i64( %a, %min } -define @smin_promote_i8( %a, %b, %c) { +define @smin_promote_i8( %a, %b) { ; CHECK-LABEL: smin_promote_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -289,7 +289,7 @@ define @smin_promote_i8( %a, %min } -define @smin_promote_i16( %a, %b, %c) { +define @smin_promote_i16( %a, %b) { ; CHECK-LABEL: smin_promote_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -302,7 +302,7 @@ define @smin_promote_i16( %a, %min } -define @smin_promote_i32( %a, %b, %c) { +define @smin_promote_i32( %a, %b) { ; CHECK-LABEL: smin_promote_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -319,7 +319,7 @@ define @smin_promote_i32( %a, @umin_i8( %a, %b, %c) { +define @umin_i8( %a, %b) { ; CHECK-LABEL: umin_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -330,7 +330,7 @@ define @umin_i8( %a, %b, ret %min } -define @umin_i16( %a, %b, %c) { +define @umin_i16( %a, %b) { ; CHECK-LABEL: umin_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -341,7 +341,7 @@ define @umin_i16( %a, %b ret %min } -define @umin_i32( %a, %b, %c) { +define @umin_i32( %a, %b) { ; CHECK-LABEL: umin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -352,7 +352,7 @@ define @umin_i32( %a, %b ret %min } -define @umin_i64( %a, %b, %c) { +define @umin_i64( %a, %b) { ; CHECK-LABEL: umin_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -363,7 +363,7 @@ define @umin_i64( %a, %b ret %min } -define @umin_split_i64( %a, %b, %c) { +define @umin_split_i64( %a, %b) { ; CHECK-LABEL: umin_split_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -375,7 +375,7 @@ define @umin_split_i64( %a, %min } -define @umin_promote_i8( %a, %b, %c) { +define @umin_promote_i8( %a, %b) { ; CHECK-LABEL: umin_promote_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -392,7 +392,7 @@ define @umin_promote_i8( %a, @smax_i8( %a, %b, %c) { +define @smax_i8( %a, %b) { ; CHECK-LABEL: smax_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -403,7 +403,7 @@ define @smax_i8( %a, %b, ret %max } -define @smax_i16( %a, %b, %c) { +define @smax_i16( %a, %b) { ; CHECK-LABEL: smax_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -414,7 +414,7 @@ define @smax_i16( %a, %b ret %max } -define @smax_i32( %a, %b, %c) { +define @smax_i32( %a, %b) { ; CHECK-LABEL: smax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -425,7 +425,7 @@ define @smax_i32( %a, %b ret %max } -define @smax_i64( %a, %b, %c) { +define @smax_i64( %a, %b) { ; CHECK-LABEL: smax_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -436,7 +436,7 @@ define @smax_i64( %a, %b ret %max } -define @smax_split_i32( %a, %b, %c) { +define @smax_split_i32( %a, %b) { ; CHECK-LABEL: smax_split_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -448,7 +448,7 @@ define @smax_split_i32( %a, %max } -define @smax_promote_i16( %a, %b, %c) { +define @smax_promote_i16( %a, %b) { ; CHECK-LABEL: smax_promote_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -465,7 +465,7 @@ define @smax_promote_i16( %a, @umax_i8( %a, %b, %c) { +define @umax_i8( %a, %b) { ; CHECK-LABEL: umax_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -476,7 +476,7 @@ define @umax_i8( %a, %b, ret %max } -define @umax_i16( %a, %b, %c) { +define @umax_i16( %a, %b) { ; CHECK-LABEL: umax_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -487,7 +487,7 @@ define @umax_i16( %a, %b ret %max } -define @umax_i32( %a, %b, %c) { +define @umax_i32( %a, %b) { ; CHECK-LABEL: umax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -498,7 +498,7 @@ define 
@umax_i32( %a, %b ret %max } -define @umax_i64( %a, %b, %c) { +define @umax_i64( %a, %b) { ; CHECK-LABEL: umax_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -509,7 +509,7 @@ define @umax_i64( %a, %b ret %max } -define @umax_split_i16( %a, %b, %c) { +define @umax_split_i16( %a, %b) { ; CHECK-LABEL: umax_split_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -521,7 +521,7 @@ define @umax_split_i16( %a, %max } -define @umax_promote_i32( %a, %b, %c) { +define @umax_promote_i32( %a, %b) { ; CHECK-LABEL: umax_promote_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -601,6 +601,50 @@ define @asr_promote_i32( %a, %shr } +; +; ASRR +; + +define @asrr_i8( %a, %b){ +; CHECK-LABEL: asrr_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %shr = ashr %b, %a + ret %shr +} + +define @asrr_i16( %a, %b){ +; CHECK-LABEL: asrr_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %shr = ashr %b, %a + ret %shr +} + +define @asrr_i32( %a, %b){ +; CHECK-LABEL: asrr_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %shr = ashr %b, %a + ret %shr +} + +define @asrr_i64( %a, %b){ +; CHECK-LABEL: asrr_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %shr = ashr %b, %a + ret %shr +} + ; ; LSL ; @@ -667,6 +711,50 @@ define @lsl_promote_i16( %a, %shl } +; +; LSLR +; + +define @lslr_i8( %a, %b){ +; CHECK-LABEL: lslr_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %shl = shl %b, %a + ret %shl +} + +define @lslr_i16( %a, %b){ +; CHECK-LABEL: lslr_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %shl = shl %b, %a + ret %shl +} + +define @lslr_i32( %a, %b){ +; CHECK-LABEL: lslr_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %shl = shl %b, %a + ret %shl +} + +define @lslr_i64( %a, %b){ +; CHECK-LABEL: lslr_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %shl = shl %b, %a + ret %shl +} + ; ; LSR ; @@ -734,6 +822,50 @@ define @lsr_split_i32( %a, %shr } +; +; LSRR +; + +define @lsrr_i8( %a, %b){ +; CHECK-LABEL: lsrr_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %shr = lshr %b, %a + ret %shr +} + +define @lsrr_i16( %a, %b){ +; CHECK-LABEL: lsrr_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %shr = lshr %b, %a + ret %shr +} + +define @lsrr_i32( %a, %b){ +; CHECK-LABEL: lsrr_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %shr = lshr %b, %a + ret %shr +} + +define @lsrr_i64( %a, %b){ +; CHECK-LABEL: lsrr_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %shr = lshr %b, %a + ret %shr +} + ; ; CMP ; diff --git a/llvm/test/CodeGen/AArch64/lrint-conv.ll b/llvm/test/CodeGen/AArch64/lrint-conv.ll index 14d078b96ff1c..80f1e8b8fc18e 100644 --- a/llvm/test/CodeGen/AArch64/lrint-conv.ll +++ b/llvm/test/CodeGen/AArch64/lrint-conv.ll @@ -1,9 +1,11 @@ ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -global-isel 
-global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=aarch64 | FileCheck %s --check-prefixes=FALLBACK,CHECK ; CHECK-LABEL: testmsws: ; CHECK: frintx [[REG:s[0-9]]], s0 ; CHECK-NEXT: fcvtzs x0, [[REG]] ; CHECK: ret +; FALLBACK-NOT: remark{{.*}}testmsws define i32 @testmsws(float %x) { entry: %0 = tail call i64 @llvm.lrint.i64.f32(float %x) @@ -15,6 +17,7 @@ entry: ; CHECK: frintx [[REG:s[0-9]]], s0 ; CHECK-NEXT: fcvtzs x0, [[REG]] ; CHECK-NEXT: ret +; FALLBACK-NOT: remark{{.*}}testmsxs define i64 @testmsxs(float %x) { entry: %0 = tail call i64 @llvm.lrint.i64.f32(float %x) @@ -25,6 +28,7 @@ entry: ; CHECK: frintx [[REG:d[0-9]]], d0 ; CHECK-NEXT: fcvtzs x0, [[REG]] ; CHECK: ret +; FALLBACK-NOT: remark{{.*}}testmswd define i32 @testmswd(double %x) { entry: %0 = tail call i64 @llvm.lrint.i64.f64(double %x) @@ -36,6 +40,7 @@ entry: ; CHECK: frintx [[REG:d[0-9]]], d0 ; CHECK-NEXT: fcvtzs x0, [[REG]] ; CHECK-NEXT: ret +; FALLBACK-NOT: remark{{.*}}testmsxd define i64 @testmsxd(double %x) { entry: %0 = tail call i64 @llvm.lrint.i64.f64(double %x) diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir b/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir new file mode 100644 index 0000000000000..992e636011b6b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-combiner-instr-fmf.mir @@ -0,0 +1,399 @@ +# RUN: llc -run-pass=machine-combiner -o - -simplify-mir -mtriple=arm64-apple-iphoneos %s | FileCheck %s + +# Can create FMADD, because both the fmul and fadd have all fast-math flags. +# +# CHECK-LABEL: name: scalar_fmadd_fast +# CHECK: [[C:%.*]]:fpr32 = COPY $s2 +# CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1 +# CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0 +# CHECK-NEXT: :fpr32 = FMADDSrrr [[B]], [[A]], [[C]] +--- +name: scalar_fmadd_fast +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: fpr32 } + - { id: 3, class: fpr32 } + - { id: 4, class: fpr32 } +liveins: + - { reg: '$s0', virtual-reg: '%0' } + - { reg: '$s1', virtual-reg: '%1' } + - { reg: '$s2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $s0, $s1, $s2 + + %2:fpr32 = COPY $s2 + %1:fpr32 = COPY $s1 + %0:fpr32 = COPY $s0 + %3:fpr32 = nnan ninf nsz arcp contract afn reassoc FMULSrr %1, %0 + %4:fpr32 = nnan ninf nsz arcp contract afn reassoc FADDSrr killed %3, %2 + $s0 = COPY %4 + RET_ReallyLR implicit $s0 + +... + +# Can create FMADD, because both the fmul and fadd have the contract fast-math flag. +# +# CHECK-LABEL: name: scalar_fmadd_contract +# CHECK: [[C:%.*]]:fpr32 = COPY $s2 +# CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1 +# CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0 +# CHECK-NEXT: :fpr32 = FMADDSrrr [[B]], [[A]], [[C]] + +--- +name: scalar_fmadd_contract +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: fpr32 } + - { id: 3, class: fpr32 } + - { id: 4, class: fpr32 } +liveins: + - { reg: '$s0', virtual-reg: '%0' } + - { reg: '$s1', virtual-reg: '%1' } + - { reg: '$s2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $s0, $s1, $s2 + + %2:fpr32 = COPY $s2 + %1:fpr32 = COPY $s1 + %0:fpr32 = COPY $s0 + %3:fpr32 = contract FMULSrr %1, %0 + %4:fpr32 = contract FADDSrr killed %3, %2 + $s0 = COPY %4 + RET_ReallyLR implicit $s0 + +... + +# Do not create FMADD, because we don't have the contract flag on the FADD. 
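+# As a rough IR-level sketch of the unfusable pattern below (illustrative
+# only, not part of the test input; value names are hypothetical):
+#   %mul = fmul contract float %b, %a
+#   %add = fadd float %mul, %c      ; fadd lacks 'contract', so no FMADD
+# The combiner keys on the flags of the FADD root, as the contract_op1
+# case further down confirms.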
+ +# CHECK-LABEL: name: scalar_fmadd_contract_op0 +# CHECK: [[C:%.*]]:fpr32 = COPY $s2 +# CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1 +# CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0 +# CHECK-NEXT: [[MUL:%.*]]:fpr32 = contract FMULSrr [[B]], [[A]] +# CHECK-NEXT: fpr32 = FADDSrr killed [[MUL]], [[C]] +--- +name: scalar_fmadd_contract_op0 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: fpr32 } + - { id: 3, class: fpr32 } + - { id: 4, class: fpr32 } +liveins: + - { reg: '$s0', virtual-reg: '%0' } + - { reg: '$s1', virtual-reg: '%1' } + - { reg: '$s2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $s0, $s1, $s2 + + %2:fpr32 = COPY $s2 + %1:fpr32 = COPY $s1 + %0:fpr32 = COPY $s0 + %3:fpr32 = contract FMULSrr %1, %0 + %4:fpr32 = FADDSrr killed %3, %2 + $s0 = COPY %4 + RET_ReallyLR implicit $s0 + +... + +# Do create FMADD, because we have the contract flag on the FADD. +# +# CHECK-LABEL: name: scalar_fmadd_contract_op1 +# CHECK: [[C:%.*]]:fpr32 = COPY $s2 +# CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1 +# CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0 +# CHECK-NEXT: :fpr32 = FMADDSrrr [[B]], [[A]], [[C]] + +--- +name: scalar_fmadd_contract_op1 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: fpr32 } + - { id: 3, class: fpr32 } + - { id: 4, class: fpr32 } +liveins: + - { reg: '$s0', virtual-reg: '%0' } + - { reg: '$s1', virtual-reg: '%1' } + - { reg: '$s2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $s0, $s1, $s2 + + %2:fpr32 = COPY $s2 + %1:fpr32 = COPY $s1 + %0:fpr32 = COPY $s0 + %3:fpr32 = FMULSrr %1, %0 + %4:fpr32 = contract FADDSrr killed %3, %2 + $s0 = COPY %4 + RET_ReallyLR implicit $s0 + +... + +# Do not create FMADD, as nsz flag does not allow it. +# +# CHECK-LABEL: name: scalar_fmadd_nsz +# CHECK: [[C:%.*]]:fpr32 = COPY $s2 +# CHECK-NEXT: [[B:%.*]]:fpr32 = COPY $s1 +# CHECK-NEXT: [[A:%.*]]:fpr32 = COPY $s0 +# CHECK-NEXT: [[MUL:%.*]]:fpr32 = nsz FMULSrr [[B]], [[A]] +# CHECK-NEXT: fpr32 = nsz FADDSrr killed [[MUL]], [[C]] + +--- +name: scalar_fmadd_nsz +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr32 } + - { id: 1, class: fpr32 } + - { id: 2, class: fpr32 } + - { id: 3, class: fpr32 } + - { id: 4, class: fpr32 } +liveins: + - { reg: '$s0', virtual-reg: '%0' } + - { reg: '$s1', virtual-reg: '%1' } + - { reg: '$s2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $s0, $s1, $s2 + + %2:fpr32 = COPY $s2 + %1:fpr32 = COPY $s1 + %0:fpr32 = COPY $s0 + %3:fpr32 = nsz FMULSrr %1, %0 + %4:fpr32 = nsz FADDSrr killed %3, %2 + $s0 = COPY %4 + RET_ReallyLR implicit $s0 + +... + +# Can create FMLA, because both the fmul and fadd have all fast-math flags. 
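+# Rough IR analogue of the fully-flagged vector pattern below (sketch
+# only; value names are hypothetical):
+#   %mul = fmul fast <2 x double> %b, %a
+#   %add = fadd fast <2 x double> %mul, %c   ; contractible into FMLA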
+# +# CHECK-LABEL: name: vector_fmadd_fast +# CHECK: [[C:%.*]]:fpr128 = COPY $q2 +# CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1 +# CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0 +# CHECK-NEXT: fpr128 = FMLAv2f64 [[C]], [[B]], [[A]] +--- +name: vector_fmadd_fast +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +liveins: + - { reg: '$q0', virtual-reg: '%0' } + - { reg: '$q1', virtual-reg: '%1' } + - { reg: '$q2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + %2:fpr128 = COPY $q2 + %1:fpr128 = COPY $q1 + %0:fpr128 = COPY $q0 + %3:fpr128 = nnan ninf nsz arcp contract afn reassoc FMULv2f64 %1, %0 + %4:fpr128 = nnan ninf nsz arcp contract afn reassoc FADDv2f64 killed %3, %2 + $q0 = COPY %4 + RET_ReallyLR implicit $q0 + +... + +# Can create FMLA, because both the fmul and fadd have the contract fast-math flag. +# +# CHECK-LABEL: name: vector_fmadd_contract +# CHECK: [[C:%.*]]:fpr128 = COPY $q2 +# CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1 +# CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0 +# CHECK-NEXT: fpr128 = FMLAv2f64 [[C]], [[B]], [[A]] +--- +name: vector_fmadd_contract +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +liveins: + - { reg: '$q0', virtual-reg: '%0' } + - { reg: '$q1', virtual-reg: '%1' } + - { reg: '$q2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + %2:fpr128 = COPY $q2 + %1:fpr128 = COPY $q1 + %0:fpr128 = COPY $q0 + %3:fpr128 = contract FMULv2f64 %1, %0 + %4:fpr128 = contract FADDv2f64 killed %3, %2 + $q0 = COPY %4 + RET_ReallyLR implicit $q0 + +... + +# Do not create FMLA, because we don't have the contract flag on the FADD. +# +# CHECK-LABEL: name: vector_fmadd_contract_op0 +# CHECK: [[C:%.*]]:fpr128 = COPY $q2 +# CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1 +# CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0 +# CHECK-NEXT: [[MUL:%.*]]:fpr128 = contract FMULv2f64 [[B]], [[A]] +# CHECK-NEXT: fpr128 = FADDv2f64 killed [[MUL]], [[C]] +--- +name: vector_fmadd_contract_op0 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +liveins: + - { reg: '$q0', virtual-reg: '%0' } + - { reg: '$q1', virtual-reg: '%1' } + - { reg: '$q2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + %2:fpr128 = COPY $q2 + %1:fpr128 = COPY $q1 + %0:fpr128 = COPY $q0 + %3:fpr128 = contract FMULv2f64 %1, %0 + %4:fpr128 = FADDv2f64 killed %3, %2 + $q0 = COPY %4 + RET_ReallyLR implicit $q0 + +... + +# Do create FMLA, because we have the contract flag on the FADD. 
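+# Rough IR analogue (sketch only; value names are hypothetical): the fmul
+# carries no flags, but 'contract' on the fadd root is enough here:
+#   %mul = fmul <2 x double> %b, %a
+#   %add = fadd contract <2 x double> %mul, %c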
+# +# CHECK-LABEL: name: vector_fmadd_contract_op1 +# CHECK: [[C:%.*]]:fpr128 = COPY $q2 +# CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1 +# CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0 +# CHECK-NEXT: fpr128 = FMLAv2f64 [[C]], [[B]], [[A]] + +--- +name: vector_fmadd_contract_op1 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +liveins: + - { reg: '$q0', virtual-reg: '%0' } + - { reg: '$q1', virtual-reg: '%1' } + - { reg: '$q2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + %2:fpr128 = COPY $q2 + %1:fpr128 = COPY $q1 + %0:fpr128 = COPY $q0 + %3:fpr128 = FMULv2f64 %1, %0 + %4:fpr128 = contract FADDv2f64 killed %3, %2 + $q0 = COPY %4 + RET_ReallyLR implicit $q0 + +... + +# Do not create FMLA, as nsz flag does not allow it. +# +# CHECK-LABEL: name: vector_fmadd_nsz +# CHECK: [[C:%.*]]:fpr128 = COPY $q2 +# CHECK-NEXT: [[B:%.*]]:fpr128 = COPY $q1 +# CHECK-NEXT: [[A:%.*]]:fpr128 = COPY $q0 +# CHECK-NEXT: [[MUL:%.*]]:fpr128 = nsz FMULv2f64 [[B]], [[A]] +# CHECK-NEXT: fpr128 = nsz FADDv2f64 killed [[MUL]], [[C]] +--- +name: vector_fmadd_nsz +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: fpr128 } + - { id: 1, class: fpr128 } + - { id: 2, class: fpr128 } + - { id: 3, class: fpr128 } + - { id: 4, class: fpr128 } +liveins: + - { reg: '$q0', virtual-reg: '%0' } + - { reg: '$q1', virtual-reg: '%1' } + - { reg: '$q2', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $q0, $q1, $q2 + + %2:fpr128 = COPY $q2 + %1:fpr128 = COPY $q1 + %0:fpr128 = COPY $q0 + %3:fpr128 = nsz FMULv2f64 %1, %0 + %4:fpr128 = nsz FADDv2f64 killed %3, %2 + $q0 = COPY %4 + RET_ReallyLR implicit $q0 + +... diff --git a/llvm/test/CodeGen/AArch64/neon-fma-FMF.ll b/llvm/test/CodeGen/AArch64/neon-fma-FMF.ll index 893d153801a71..0eb173396ce97 100644 --- a/llvm/test/CodeGen/AArch64/neon-fma-FMF.ll +++ b/llvm/test/CodeGen/AArch64/neon-fma-FMF.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; RUN: llc < %s -O3 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define <2 x float> @fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) { ; CHECK-LABEL: fma_1: diff --git a/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir b/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir new file mode 100644 index 0000000000000..1d0903d1af8a7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rmif-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:49: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$x0' } +body: | + bb.0: + liveins: $x0 + + RMIF renamable $x0, 0, 0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir b/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir new file mode 100644 index 0000000000000..74274a2acb2e6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rmif-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:45: missing implicit register operand 'implicit-def $nzcv' +... 
+--- +name: test_flags +liveins: + - { reg: '$x0' } +body: | + bb.0: + liveins: $x0 + + RMIF renamable $x0, 0, 0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir b/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir new file mode 100644 index 0000000000000..e828ddad68e19 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf16-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:45: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF16 renamable $w0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir b/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir new file mode 100644 index 0000000000000..7f2c5606f2e8f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf16-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:41: missing implicit register operand 'implicit-def $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF16 renamable $w0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir b/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir new file mode 100644 index 0000000000000..d9a9ef00efd59 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf8-def-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:44: missing implicit register operand 'implicit $nzcv' +... +--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF8 renamable $w0, implicit-def $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir b/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir new file mode 100644 index 0000000000000..05d803f7b7b61 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setf8-use-nzcv.mir @@ -0,0 +1,16 @@ +# RUN: not llc -o - %s -mtriple=arm64-eabi -run-pass=legalizer 2>&1 | FileCheck %s + +# CHECK: [[@LINE+10]]:40: missing implicit register operand 'implicit-def $nzcv' +... 
+--- +name: test_flags +liveins: + - { reg: '$w0' } +body: | + bb.0: + liveins: $w0 + + SETF8 renamable $w0, implicit $nzcv + RET undef $lr, implicit killed $w0 + + diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll b/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll index 9ca4ff59b24f2..1525b2d3e3531 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-dbg.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll index 258586824530c..2099e722fe523 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-arm-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll index 820041e061a3c..11389d5d5ba7c 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll @@ -24,7 +24,7 @@ ; return 0; ; } -; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll index 5c4be4354ea7b..9dc08c192a01b 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll index 3c0dc3df98dbd..a73c79d6cc985 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll @@ -1,4 +1,4 @@ -;; RUN: opt -S -stack-tagging %s -o - | FileCheck %s +;; RUN: opt -S -aarch64-stack-tagging %s -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-arm-unknown-eabi" diff --git a/llvm/test/CodeGen/AArch64/stack-tagging.ll b/llvm/test/CodeGen/AArch64/stack-tagging.ll index 20d3c6bebdec4..275b8a7dbad7e 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s --check-prefixes=CHECK,SSI -; RUN: opt < %s -stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK,NOSSI +; RUN: opt < %s -aarch64-stack-tagging -S -o - | FileCheck %s --check-prefixes=CHECK,SSI +; RUN: opt < %s -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 -S -o - | FileCheck %s --check-prefixes=CHECK,NOSSI target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index 86fff734f1883..3bb3627e23939 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -257,3 +257,58 @@ define @oeq_4f32_zext( %x, %y to ret %r } + +define @eq_fast( %x, %x2) { +; CHECK-LABEL: eq_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast oeq %x, %x2 + ret %y +} +define @gt_fast( %x, %x2) { +; CHECK-LABEL: gt_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast ogt %x, %x2 + ret %y +} +define @ge_fast( %x, %x2) { +; CHECK-LABEL: ge_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast oge %x, %x2 + ret %y +} +define @lt_fast( %x, %x2) { +; CHECK-LABEL: lt_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmgt p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: ret + %y = fcmp fast olt %x, %x2 + ret %y +} +define @le_fast( %x, %x2) { +; CHECK-LABEL: le_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmge p0.s, p0/z, z1.s, z0.s +; CHECK-NEXT: ret + %y = fcmp fast ole %x, %x2 + ret %y +} +define @ne_fast( %x, %x2) { +; CHECK-LABEL: ne_fast: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ret + %y = fcmp fast one %x, %x2 + ret %y +} diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll index 191fddacffd1d..96c754778d52e 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s +; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll index 723ffd8c17330..47e0ff8f19c7f 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s +; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck --check-prefix OPT %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll index 8a5d669e4241e..fa67d92c2ae0e 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll @@ -325,6 +325,39 @@ define @uqsub_i64( %a, % ret %out } +; ADD (tuples) + +define @add_i64_tuple2(* %out, %in1, %in2) { +; CHECK-LABEL: add_i64_tuple2 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %in1, %in2) + %res = add %tuple, %tuple + ret %res +} + +define @add_i64_tuple3(* %out, %in1, %in2, %in3) { +; CHECK-LABEL: add_i64_tuple3 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d +; CHECK: add z2.d, z2.d, z2.d + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %in1, %in2, %in3) + %res = add %tuple, %tuple + ret %res +} + +define @add_i64_tuple4(* %out, %in1, %in2, %in3, %in4) { +; CHECK-LABEL: add_i64_tuple4 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d +; CHECK: add z2.d, z2.d, z2.d +; CHECK: add z3.d, z3.d, z3.d + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %in1, %in2, %in3, %in4) + %res = add %tuple, %tuple + ret %res +} + + declare @llvm.aarch64.sve.abs.nxv16i8(, , ) declare @llvm.aarch64.sve.abs.nxv8i16(, , ) declare @llvm.aarch64.sve.abs.nxv4i32(, , ) @@ -366,3 +399,7 @@ declare @llvm.aarch64.sve.uqsub.x.nxv16i8(, declare @llvm.aarch64.sve.uqsub.x.nxv8i16(, ) declare @llvm.aarch64.sve.uqsub.x.nxv4i32(, ) declare @llvm.aarch64.sve.uqsub.x.nxv2i64(, ) + +declare @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(, ) +declare @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(, , ) +declare @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(, , , ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll new file mode 100644 index 0000000000000..e7edfc9d6bdd6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll @@ -0,0 +1,174 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning + +; +; LD1ROB +; + +define @ld1rob_i8( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #32] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; +; LD1ROH +; + +define @ld1roh_i16( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1roh_f16( %pg, half* %a) nounwind { +; CHECK-LABEL: ld1roh_f16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr half, half* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8f16( %pg, half* %base) + ret %load +} + +define @ld1roh_bf16( %pg, bfloat* %a) nounwind #0 { +; CHECK-LABEL: ld1roh_bf16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr bfloat, bfloat* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8bf16( %pg, bfloat* %base) + ret %load +} + +; +; LD1ROW +; + +define @ld1row_i32( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32: +; CHECK-NEXT: 
ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +define @ld1row_f32( %pg, float* %a) nounwind { +; CHECK-LABEL: ld1row_f32: +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr float, float* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4f32( %pg, float* %base) + ret %load +} + +; +; LD1ROD +; + +define @ld1rod_i64( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-64] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 -8 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rod_f64( %pg, double* %a) nounwind { +; CHECK-LABEL: ld1rod_f64: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-128] +; CHECK-NEXT: ret + %base = getelementptr double, double* %a, i64 -16 + %load = call @llvm.aarch64.sve.ld1ro.nxv2f64( %pg, double* %base) + ret %load +} + + +;;;;;;;;;;;;;; +; range checks: immediate must be a multiple of 32 in the range -256, ..., 224 + +; lower bound +define @ld1rob_i8_lower_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_lower_bound: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #-256] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -256 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; below lower bound +define @ld1roh_i16_below_lower_bound( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16_below_lower_bound: +; CHECK-NEXT: sub x[[BASE:[0-9]+]], x0, #258 +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 -129 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1rob_i8_below_lower_bound_01( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_below_lower_bound_01: +; CHECK-NEXT: mov x[[OFFSET:[0-9]+]], #-257 +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -257 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; not a multiple of 32 +define @ld1row_i32_not_multiple( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32_not_multiple: +; CHECK-NEXT: add x[[BASE:[0-9]+]], x0, #12 +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 3 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +; upper bound +define @ld1rod_i64_upper_bound( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64_upper_bound: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #224] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 28 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rob_i8_beyond_upper_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_beyond_upper_bound: +; CHECK-NEXT: mov w[[OFFSET:[0-9]+]], #225 +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 225 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +declare @llvm.aarch64.sve.ld1ro.nxv16i8(, i8*) + +declare @llvm.aarch64.sve.ld1ro.nxv8i16(, i16*) +declare @llvm.aarch64.sve.ld1ro.nxv8f16(, half*) +declare @llvm.aarch64.sve.ld1ro.nxv8bf16(, bfloat*) + +declare @llvm.aarch64.sve.ld1ro.nxv4i32(, i32*) +declare @llvm.aarch64.sve.ld1ro.nxv4f32(, float*) + +declare @llvm.aarch64.sve.ld1ro.nxv2i64(, i64*) +declare 
@llvm.aarch64.sve.ld1ro.nxv2f64(, double*) + + +; +bf16 is required for the bfloat version. +attributes #0 = { "target-features"="+sve,+f64mm,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll new file mode 100644 index 0000000000000..1ffa78ec27352 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll @@ -0,0 +1,495 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; NOTE: invalid, upper and lower bound immediate values of the regimm +; addressing mode are checked only for the byte version of each +; instruction (`ldb`), as the code for detecting the immediate is +; common to all instructions, and varies only for the number of +; elements of the structure store, which is = 2, 3, 4. + +; ld2b +define @ld2.nxv32i8( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 2 +%base_ptr = bitcast * %base to i8* +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_lower_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_upper_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_not_multiple_of_2( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_not_multiple_of_2: +; CHECK: rdvl x[[OFFSET:[0-9]]], #3 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-18 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -18 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #16 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld2.nxv16f16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * 
%base to half * +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld2.nxv16bf16( %Pg, * %addr) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 12 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld2.nxv8f32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld2.nxv4f64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_lower_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_upper_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_01( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #4 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_02( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define 
@ld3.nxv48i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-27 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -27 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #24 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld3h +define @ld3.nxv24i16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld3.nxv24f16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld3.nxv24bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld3.nxv12f32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld3.nxv6f64( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ; ld4b +define @ld4.nxv64i8( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_lower_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, 
p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_upper_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_01( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_02( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #6 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 6 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_03( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03: +; CHECK: rdvl x[[OFFSET:[0-9]]], #7 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 7 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #9) +; xM = -9 * 2^6 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov x[[M:[0-9]]], #-576 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -36 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2) +; xM = 2^9 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov w[[M:[0-9]]], #512 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #8, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 8 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld4.nxv32f16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32f16: 
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld4.nxv32bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld4.nxv16f32( %Pg, * %addr) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld4d +define @ld4.nxv8i64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld4.nxv8f64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double * %base_ptr) +ret %res +} + +declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +; +bf16 is required for the bfloat version. 
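+
+; A rough summary of the immediate form exercised above, as the bound tests
+; suggest: the reg+imm variant of ldN takes a vector-length-scaled immediate
+; that must be a multiple of N in the range [-8*N, 7*N] (ld2: [-16, 14],
+; ld3: [-24, 21], ld4: [-32, 28]); anything outside that is materialized with
+; rdvl (or an rdvl/mul sequence), as in the out-of-bounds tests.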
+attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll new file mode 100644 index 0000000000000..ab59c84137958 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll @@ -0,0 +1,259 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; ld2b +define @ld2.nxv32i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld2.nxv16f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld2.nxv16bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld2.nxv8f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld2.nxv4f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld3h +define @ld3.nxv24i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld3.nxv24f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, 
x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld3.nxv24bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld3.nxv12f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld3.nxv6f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld4b +define @ld4.nxv64i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld4.nxv32f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv32f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld4.nxv32bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld4.nxv16f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( 
%Pg, float *%addr2) +ret %res +} + +; ld4d +define @ld4.nxv8i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv8i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld4.nxv8f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv8f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +; +bf16 is required for the bfloat version. +attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir new file mode 100644 index 0000000000000..c20846c54b6a2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir @@ -0,0 +1,61 @@ +# RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -run-pass=localstackalloc -o - %s | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + define @insert_32i8_idx( %a, i8 %elt, i64 %idx) #0 { + %ins = insertelement %a, i8 %elt, i64 %idx + ret %ins + } + + attributes #0 = { "target-features"="+sve" } + +... 
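+# As the CHECK lines below suggest, the stack object here uses the scalable
+# 'sve-vec' stack ID, so localstackalloc is presumably expected to skip it
+# rather than pre-allocate it into the local frame block, which is why
+# localFrameSize must remain 0.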
+--- +name: insert_32i8_idx +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: zpr, preferred-register: '' } + - { id: 1, class: zpr, preferred-register: '' } + - { id: 2, class: gpr32, preferred-register: '' } + - { id: 3, class: gpr64, preferred-register: '' } + - { id: 5, class: ppr_3b, preferred-register: '' } + - { id: 6, class: gpr64sp, preferred-register: '' } + - { id: 7, class: zpr, preferred-register: '' } + - { id: 8, class: zpr, preferred-register: '' } +liveins: + - { reg: '$z0', virtual-reg: '%0' } + - { reg: '$z1', virtual-reg: '%1' } + - { reg: '$w0', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +# CHECK-LABEL: name: insert_32i8_idx +# CHECK: localFrameSize: 0 +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: sve-vec, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: {} +body: | + bb.0 (%ir-block.0): + liveins: $z0, $z1, $w0 + + %2:gpr32 = COPY $w0 + %1:zpr = COPY $z1 + %0:zpr = COPY $z0 + %5:ppr_3b = PTRUE_B 31 + %6:gpr64sp = ADDXri %stack.0, 0, 0 + ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16) + ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) + %7:zpr = LD1B_IMM %5, %6, 1 :: (load unknown-size from %stack.0 + 16, align 16) + %8:zpr = LD1B_IMM %5, %stack.0, 0 :: (load unknown-size from %stack.0, align 16) + $z0 = COPY %8 + $z1 = COPY %7 + RET_ReallyLR implicit $z0, implicit $z1 + +... diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll index f5047a7bcbaff..5e48ad8b628e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll @@ -188,6 +188,94 @@ define void @masked_store_nxv8bf16( *%a, @masked.load.nxv2p0i8(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0i8: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0i8.p0nxv2p0i8(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0i16(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0i16: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0i16.p0nxv2p0i16(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0i32(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0i32: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0i32.p0nxv2p0i32(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0i64(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0i64: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0i64.p0nxv2p0i64(* %vector_ptr, i32 8, %mask, undef) + ret %v +} + +; Pointer of floating-point type + +define @masked.load.nxv2p0bf16(* %vector_ptr, %mask) nounwind #0 { +; CHECK-LABEL: masked.load.nxv2p0bf16: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0bf16.p0nxv2p0bf16(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0f16(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0f16: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0f16.p0nxv2p0f16(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0f32(* 
%vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0f32: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0f32.p0nxv2p0f32(* %vector_ptr, i32 8, %mask, undef) + ret %v +} +define @masked.load.nxv2p0f64(* %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.load.nxv2p0f64: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %v = call @llvm.masked.load.nxv2p0f64.p0nxv2p0f64(* %vector_ptr, i32 8, %mask, undef) + ret %v +} + +; Pointer of array type + +define void @masked.store.nxv2p0a64i16( %data, * %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.store.nxv2p0a64i16: +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.nxv2p0a64i16.p0nxv2p0a64i16( %data, * %vector_ptr, i32 8, %mask) + ret void +} + +; Pointer of struct type + +%struct = type { i8*, i32 } +define void @masked.store.nxv2p0s_struct( %data, * %vector_ptr, %mask) nounwind { +; CHECK-LABEL: masked.store.nxv2p0s_struct: +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.masked.store.nxv2p0s_struct.p0nxv2p0s_struct( %data, * %vector_ptr, i32 8, %mask) + ret void +} + + declare @llvm.masked.load.nxv2i64(*, i32, , ) declare @llvm.masked.load.nxv4i32(*, i32, , ) declare @llvm.masked.load.nxv8i16(*, i32, , ) @@ -214,5 +302,19 @@ declare void @llvm.masked.store.nxv4f16(, declare void @llvm.masked.store.nxv8f16(, *, i32, ) declare void @llvm.masked.store.nxv8bf16(, *, i32, ) +declare @llvm.masked.load.nxv2p0i8.p0nxv2p0i8(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0i16.p0nxv2p0i16(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0i32.p0nxv2p0i32(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0i64.p0nxv2p0i64(*, i32 immarg, , ) + +declare @llvm.masked.load.nxv2p0bf16.p0nxv2p0bf16(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0f16.p0nxv2p0f16(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0f32.p0nxv2p0f32(*, i32 immarg, , ) +declare @llvm.masked.load.nxv2p0f64.p0nxv2p0f64(*, i32 immarg, , ) + +declare void @llvm.masked.store.nxv2p0a64i16.p0nxv2p0a64i16(, *, i32 immarg, ) + +declare void @llvm.masked.store.nxv2p0s_struct.p0nxv2p0s_struct(, *, i32 immarg, ) + ; +bf16 is required for the bfloat version. attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll index 24cf433306bb5..1275811d175e1 100644 --- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning define @sext_i1_i8( %a) { ; CHECK-LABEL: sext_i1_i8: diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 46d152bbf7acb..7c0e9e9f4d9b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -116,8 +116,7 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset p4, -16
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z7.d, z7.d, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index 7a53e5879c908..043e4ea135f34 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -134,6 +134,19 @@ define <vscale x 2 x i32> @sve_splat_2xi32(i32 %val) {
 ret <vscale x 2 x i32> %splat
 }
 
+;; Widen/split splats of wide vector types.
+
+define <vscale x 12 x i32> @sve_splat_12xi32(i32 %val) {
+; CHECK-LABEL: @sve_splat_12xi32
+; CHECK: mov z0.s, w0
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: ret
+ %ins = insertelement <vscale x 12 x i32> undef, i32 %val, i32 0
+ %splat = shufflevector <vscale x 12 x i32> %ins, <vscale x 12 x i32> undef, <vscale x 12 x i32> zeroinitializer
+ ret <vscale x 12 x i32> %splat
+}
+
 define <vscale x 2 x i1> @sve_splat_2xi1(i1 %val) {
 ; CHECK-LABEL: @sve_splat_2xi1
 ; CHECK: sbfx x8, x0, #0, #1
diff --git a/llvm/test/CodeGen/AArch64/tagged-globals.ll b/llvm/test/CodeGen/AArch64/tagged-globals.ll
index b0cf882584c6f..cdca6c7a1fba1 100644
--- a/llvm/test/CodeGen/AArch64/tagged-globals.ll
+++ b/llvm/test/CodeGen/AArch64/tagged-globals.ll
@@ -1,31 +1,105 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --relocation-model=static < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK-STATIC,CHECK-SELECTIONDAGISEL
+; RUN: llc --relocation-model=pic < %s \
+; RUN:   | FileCheck %s --check-prefix=CHECK-PIC
+
+; Ensure that GlobalISel lowers correctly. GlobalISel is the default ISel for
+; -O0 on aarch64. GlobalISel lowers the instruction sequence in the static
+; relocation model differently from SelectionDAGISel. GlobalISel does the
+; lowering of AddLow *after* legalization, and thus doesn't differentiate
+; between address-taken-only vs. address-taken-for-loadstore. Hence, GlobalISel
+; generates a movk instruction for load/store sequences as well. GlobalISel
+; also doesn't have the scaffolding to correctly check the bounds of the global
+; offset, and cannot fold the lo12 bits into the load/store. Neither of these
+; things is a problem, as GlobalISel is only used by default at -O0, so we
+; don't mind the code-size and performance cost.
+
+; RUN: llc --aarch64-enable-global-isel-at-O=0 -O0 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK-STATIC,CHECK-GLOBALISEL
+; RUN: llc --aarch64-enable-global-isel-at-O=0 -O0 --relocation-model=pic < %s \
+; RUN:   | FileCheck %s --check-prefix=CHECK-PIC
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-android"
 
-@global = external hidden global i32
+@global = external global i32
 declare void @func()
 
 define i32* @global_addr() #0 {
-  ; CHECK: global_addr:
-  ; CHECK: adrp x0, :pg_hi21_nc:global
-  ; CHECK: movk x0, #:prel_g3:global+4294967296
-  ; CHECK: add x0, x0, :lo12:global
+  ; Static relocation model has common codegen between SelectionDAGISel and
+  ; GlobalISel when the address of a global isn't folded into a load or
+  ; store instruction.
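+  ; (In both sequences below, the movk of #:prel_g3:global+4294967296 is
+  ; presumably what materializes the tag in the high bits of the address,
+  ; while the adrp/add pair supplies the page and :lo12: offset.)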
+ ; CHECK-STATIC: global_addr: + ; CHECK-STATIC: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global + ; CHECK-STATIC: movk [[REG]], #:prel_g3:global+4294967296 + ; CHECK-STATIC: add x0, [[REG]], :lo12:global + ; CHECK-STATIC: ret + + ; CHECK-PIC: global_addr: + ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:global + ; CHECK-PIC: ldr x0, {{\[}}[[REG]], :got_lo12:global] + ; CHECK-PIC: ret + ret i32* @global } define i32 @global_load() #0 { - ; CHECK: global_load: - ; CHECK: adrp x8, :pg_hi21_nc:global - ; CHECK: ldr w0, [x8, :lo12:global] + ; CHECK-SELECTIONDAGISEL: global_load: + ; CHECK-SELECTIONDAGISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global + ; CHECK-SELECTIONDAGISEL: ldr w0, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: ret + + ; CHECK-GLOBALISEL: global_load: + ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global + ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 + ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global + ; CHECK-GLOBALISEL: ldr w0, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: ret + + ; CHECK-PIC: global_load: + ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:global + ; CHECK-PIC: ldr [[REG]], {{\[}}[[REG]], :got_lo12:global] + ; CHECK-PIC: ldr w0, {{\[}}[[REG]]{{\]}} + ; CHECK-PIC: ret + %load = load i32, i32* @global ret i32 %load } +define void @global_store() #0 { + ; CHECK-SELECTIONDAGISEL: global_store: + ; CHECK-SELECTIONDAGISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global + ; CHECK-SELECTIONDAGISEL: str wzr, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: ret + + ; CHECK-GLOBALISEL: global_store: + ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global + ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 + ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global + ; CHECK-GLOBALISEL: str wzr, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: ret + + ; CHECK-PIC: global_store: + ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:global + ; CHECK-PIC: ldr [[REG]], {{\[}}[[REG]], :got_lo12:global] + ; CHECK-PIC: str wzr, {{\[}}[[REG]]{{\]}} + ; CHECK-PIC: ret + + store i32 0, i32* @global + ret void +} + define void ()* @func_addr() #0 { - ; CHECK: func_addr: - ; CHECK: adrp x0, func - ; CHECK: add x0, x0, :lo12:func + ; CHECK-STATIC: func_addr: + ; CHECK-STATIC: adrp [[REG:x[0-9]+]], func + ; CHECK-STATIC: add x0, [[REG]], :lo12:func + ; CHECK-STATIC: ret + + ; CHECK-PIC: func_addr: + ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:func + ; CHECK-PIC: ldr x0, {{\[}}[[REG]], :got_lo12:func] + ; CHECK-PIC: ret + ret void ()* @func } diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll new file mode 100644 index 0000000000000..c69405524bcd3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-gep.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=arm64_32-apple-watchos2.0.0 --aarch64-neon-syntax=generic | FileCheck %s + +target datalayout = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128" +target triple = "arm64_32-apple-watchos2.0.0" + +; CHECK-LABEL: lCPI0_0: +; CHECK-NEXT: .quad 36 +; CHECK-NEXT: .quad 4804 + +define <2 x i8*> @vector_gep(<2 x i8*> %0) { +; CHECK-LABEL: vector_gep: +; CHECK: adrp x[[REG8:[123]?[0-9]]], lCPI0_0@PAGE +; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG8]], lCPI0_0@PAGEOFF] +; CHECK: add v[[REG0:[0-9]+]].2d, v[[REG0]].2d, v[[REG1]].2d +; CHECK: movi v[[REG1]].2d, #0x000000ffffffff +; CHECK: and v[[REG0]].16b, v[[REG0]].16b, v[[REG1]].16b +; CHECK: ret +entry: + %1 = getelementptr i8, <2 x i8*> %0, <2 x i32> + ret <2 x i8*> %1 +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index ccf846a933abd..2205bfe3c71d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -91,15 +91,14 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffffffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffc0 +; GFX8-NEXT: s_movk_i32 s4, 0xffc0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -113,8 +112,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0xffffffc0, 4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -133,14 +132,14 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 4, 0xffffffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffc0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 ; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 @@ -152,13 +151,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffffffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 0xffffffc0, 4 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0 +; GFX9-NEXT: 
s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -210,12 +204,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 4, 0xffffffc0 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 71ee562f0ecc2..c1896f81ef296 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -522,8 +522,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_ashr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, s4, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, ret <2 x i16> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 2695952bfd193..1f9c3bc60876e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -135,24 +135,24 @@ define void @constrained_if_register_class() { ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s4, 1, 0 -; CHECK-NEXT: s_xor_b32 s4, s4, 1 -; CHECK-NEXT: s_and_b32 s4, s4, 1 -; CHECK-NEXT: s_cmp_lg_u32 s4, 0 +; CHECK-NEXT: s_cselect_b32 s5, 1, 0 +; CHECK-NEXT: s_xor_b32 s5, s5, -1 +; CHECK-NEXT: s_and_b32 s5, s5, 1 +; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: s_cmp_lg_u32 s5, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll new file mode 
100644 index 0000000000000..81f6b8e71254c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s + +; Make sure legalizer info doesn't assert on dummy targets + +define i16 @vop3p_add_i16(i16 %arg0) #0 { + ; CHECK-LABEL: name: vop3p_add_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %add = add i16 %arg0, %arg0 + ret i16 %add +} + +define <2 x i16> @vop3p_add_v2i16(<2 x i16> %arg0) #0 { + ; CHECK-LABEL: name: vop3p_add_v2i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; CHECK: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC2]] + ; CHECK: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[TRUNC3]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %add = add <2 x i16> %arg0, %arg0 + ret <2 x i16> %add +} + +define i16 @halfinsts_add_i16(i16 %arg0) #1 { + ; CHECK-LABEL: name: halfinsts_add_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK: $vgpr0 = COPY [[COPY3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %add = add i16 %arg0, %arg0 + ret i16 %add +} + +define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 { + ; CHECK-LABEL: name: halfinsts_add_v2i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; 
CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+ ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+ ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]]
+ ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
+ ; CHECK: $vgpr0 = COPY [[COPY7]](s32)
+ ; CHECK: $vgpr1 = COPY [[COPY8]](s32)
+ ; CHECK: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+ ; CHECK: S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1
+ %add = add <2 x i16> %arg0, %arg0
+ ret <2 x i16> %add
+}
+
+attributes #0 = { "target-features"="+vop3p" }
+attributes #1 = { "target-features"="+16-bit-insts" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
new file mode 100644
index 0000000000000..28c0651b10fd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -0,0 +1,769 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX9-NEXT: s_lshl_b32 m0, s4, 1
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9]
+; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX8-NEXT: s_lshl_b32 m0, s4, 1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_movrels_b64 s[0:1], s[8:9]
+; GFX8-NEXT: s_movrels_b64 s[2:3], s[10:11]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GFX7-NEXT: s_lshl_b32 m0, s4, 1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_movrels_b64 s[0:1], s[8:9]
+; GFX7-NEXT: s_movrels_b64 s[2:3], s[10:11]
+; GFX7-NEXT: ; return to shader part epilog
+ %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr
+ %element = extractelement <4 x i128> %vector, i32 %idx
+ ret i128 %element
+}
+
+define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 inreg %idx) {
+; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX9-NEXT: s_lshl_b32 s0, s2, 1
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-NEXT: v_mov_b32_e32 v18, v2
+; 
GFX9-NEXT: v_mov_b32_e32 v3, v3 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v18 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GFX8-NEXT: s_lshl_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 m0, s0, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_movrels_b32_e32 v1, v3 +; GFX8-NEXT: v_movrels_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_lshl_b32 s0, s2, 1 +; GFX7-NEXT: s_lshl_b32 m0, s0, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_movrels_b32_e32 v1, v3 +; GFX7-NEXT: v_movrels_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, 
v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v3, v7, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 
v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 
v2, v2, v14, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, s14 +; GFX9-NEXT: v_mov_b32_e32 v16, s15 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s9 +; GFX9-NEXT: v_mov_b32_e32 v13, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v11, s10 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX8-NEXT: v_mov_b32_e32 v13, s12 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX8-NEXT: v_mov_b32_e32 v14, s13 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX8-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_mov_b32_e32 v16, s15 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, s4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v11, s8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 +; GFX8-NEXT: v_mov_b32_e32 v13, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s14 +; GFX8-NEXT: v_mov_b32_e32 v9, s15 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; 
GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v6, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s7 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX7-NEXT: v_mov_b32_e32 v11, s10 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX7-NEXT: v_mov_b32_e32 v12, s11 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX7-NEXT: v_mov_b32_e32 v13, s12 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX7-NEXT: v_mov_b32_e32 v14, s13 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX7-NEXT: v_mov_b32_e32 v15, s14 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX7-NEXT: v_mov_b32_e32 v16, s15 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v7, s4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, s6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, s7 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX7-NEXT: v_mov_b32_e32 v11, s8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX7-NEXT: v_mov_b32_e32 v12, s9 +; GFX7-NEXT: v_mov_b32_e32 v13, s10 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s14 +; GFX7-NEXT: v_mov_b32_e32 v9, s15 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; 
GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 0 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s2, s6 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 1 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_mov_b32 s1, s9 +; GCN-NEXT: s_mov_b32 s2, s10 +; GCN-NEXT: s_mov_b32 s3, s11 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 2 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s12 +; GCN-NEXT: s_mov_b32 s1, s13 +; GCN-NEXT: s_mov_b32 s2, s14 +; GCN-NEXT: s_mov_b32 s3, s15 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 3 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx0(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 0 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx1(<4 x i128> addrspace(1)* %ptr) 
{ +; GFX9-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: v_mov_b32_e32 v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 1 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx2(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v8 +; GFX8-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 +; GFX8-NEXT: v_mov_b32_e32 v3, v11 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 2 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx3(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: 
v_mov_b32_e32 v3, v15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v12 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v3, v15 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v12 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v3, v15 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 3 + ret i128 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll new file mode 100644 index 0000000000000..13d7fbeda0f6d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -0,0 +1,802 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_lshr_b32 s2, s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s4, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; 
GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader 
part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 0 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 1 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 2 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 3 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx0(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 0 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx1(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 1 + ret i16 %element +} + +define i16 
@extractelement_vgpr_v4i16_idx2(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 2 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx3(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 3 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_lshr_b32 s5, s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s5, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 2 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 3 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_and_b32 s1, s4, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 
v1, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 0 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 1 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 2 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 3 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 4 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx5: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 5 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s3 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 6 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx7: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s3, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 7 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx0(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 0 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx1(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 
addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 1 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx2(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 2 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx3(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 3 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx4(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 4 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx5(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 5 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx6(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 6 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx7(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 
16, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 7 + ret i16 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll new file mode 100644 index 0000000000000..95b4177abbcab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -0,0 +1,3135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s5, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s5 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
extractelement_vgpr_v4i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s1, s0, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s4 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, 
s0, s1 +; GFX7-NEXT: s_and_b32 s1, s3, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 0 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 1 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 2 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: 
s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, 
v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; 
GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 3 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s9, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s0, 8 +; GCN-NEXT: s_and_b32 s2, s2, s9 +; GCN-NEXT: s_lshr_b32 s3, s0, 16 +; GCN-NEXT: s_lshr_b32 s5, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s3, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s5, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshr_b32 s6, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s6, s9 +; GCN-NEXT: s_lshr_b32 s7, s1, 16 +; GCN-NEXT: s_lshr_b32 s8, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s7, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s8, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshr_b32 s2, s4, 2 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; 
GFX7-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v5 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s8, 0xff +; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s0, 8 +; GCN-NEXT: s_and_b32 s2, s2, s8 +; GCN-NEXT: s_lshr_b32 s3, s0, 16 +; GCN-NEXT: s_lshr_b32 s4, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s3, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s4, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshr_b32 s5, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s5, s8 +; GCN-NEXT: s_lshr_b32 s6, s1, 16 +; GCN-NEXT: s_lshr_b32 s7, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s6, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 
+; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s7, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 0 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 1 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 2 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 3 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 4 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx5: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 5 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 6 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx7: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 
0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 7 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> 
addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 
dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: 
v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], 
s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 4 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector 
= load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 5 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 6 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, 
s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 7 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s17, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s5, s0, 8 +; GCN-NEXT: s_and_b32 s5, s5, s17 +; GCN-NEXT: s_lshr_b32 s6, s0, 16 +; GCN-NEXT: s_lshr_b32 s7, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s6, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s7, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 s8, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s8, s17 +; GCN-NEXT: s_lshr_b32 s9, s1, 16 +; GCN-NEXT: s_lshr_b32 s10, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s9, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s10, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 
s11, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s11, s17 +; GCN-NEXT: s_lshr_b32 s12, s2, 16 +; GCN-NEXT: s_lshr_b32 s13, s2, 24 +; GCN-NEXT: s_and_b32 s2, s2, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s12, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s13, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 s14, s3, 8 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s14, s17 +; GCN-NEXT: s_lshr_b32 s15, s3, 16 +; GCN-NEXT: s_lshr_b32 s16, s3, 24 +; GCN-NEXT: s_and_b32 s3, s3, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_and_b32 s5, s15, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_and_b32 s5, s16, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_lshr_b32 s5, s4, 2 +; GCN-NEXT: s_cmp_eq_u32 s5, 1 +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 2 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 3 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v7 +; GFX9-NEXT: v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, v4, v5 +; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: s_lshl_b32 s0, s1, 3 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v11, v11, v4 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX7-NEXT: v_and_b32_e32 v12, v12, v4 +; GFX7-NEXT: v_and_b32_e32 v14, v14, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_and_b32_e32 v13, v13, v4 +; GFX7-NEXT: v_and_b32_e32 v15, v15, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX7-NEXT: v_and_b32_e32 v4, v16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: 
v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v3, v3, s5, v8 +; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v9 +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 +; GFX9-NEXT: v_or3_b32 v1, v3, v12, v13 +; GFX9-NEXT: v_or3_b32 v3, v4, v14, v15 +; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_or3_b32 v4, v5, v16, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_or3_b32 v0, v0, v18, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_mov_b32_e32 v0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v12, v12, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v13, v13, v0 +; GFX7-NEXT: v_and_b32_e32 v15, v15, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 
+; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_and_b32_e32 v14, v14, v0 +; GFX7-NEXT: v_and_b32_e32 v16, v16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_and_b32_e32 v0, v17, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v15 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v14 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s16, 0xff +; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s0, 8 +; GCN-NEXT: s_and_b32 s4, s4, s16 +; GCN-NEXT: s_lshr_b32 s5, s0, 16 +; GCN-NEXT: s_lshr_b32 s6, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s5, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s6, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s7, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s7, s16 +; GCN-NEXT: s_lshr_b32 s8, s1, 16 +; GCN-NEXT: s_lshr_b32 s9, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s8, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s9, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s10, s16 +; GCN-NEXT: s_lshr_b32 s11, s2, 16 +; GCN-NEXT: s_lshr_b32 s12, s2, 24 +; GCN-NEXT: s_and_b32 s2, s2, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s11, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s12, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s13, s3, 8 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s13, s16 +; GCN-NEXT: s_lshr_b32 s14, s3, 16 +; GCN-NEXT: s_lshr_b32 s15, s3, 24 +; GCN-NEXT: s_and_b32 s3, s3, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s3, s3, s4 
+; GCN-NEXT: s_and_b32 s4, s14, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: s_and_b32 s4, s15, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x 
i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 
v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 4 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: 
v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 5 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 6 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 7 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 8 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 9 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; 
GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 10 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 11 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: 
extractelement_vgpr_v16i8_idx12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 12 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 13 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 14 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: 
s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 15 + ret i8 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 8e4a071701b35..b4afc48f98ba8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1492,7 +1492,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 @@ -1501,7 +1501,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -2123,7 +2123,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 @@ -2137,7 +2137,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -4111,7 +4111,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v17, v2 +; MOVREL-NEXT: v_mov_b32_e32 v19, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 ; MOVREL-NEXT: v_mov_b32_e32 v18, v3 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 @@ -4119,7 +4119,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 ; MOVREL-NEXT: 
v_cndmask_b32_e64 v12, v12, v14, s5 ; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo @@ -4251,42 +4251,42 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_mov_b32_e32 v20, s15 +; MOVREL-NEXT: v_mov_b32_e32 v19, s14 +; MOVREL-NEXT: v_mov_b32_e32 v18, s13 +; MOVREL-NEXT: v_mov_b32_e32 v17, s12 +; MOVREL-NEXT: v_mov_b32_e32 v16, s11 +; MOVREL-NEXT: v_mov_b32_e32 v15, s10 +; MOVREL-NEXT: v_mov_b32_e32 v14, s9 +; MOVREL-NEXT: v_mov_b32_e32 v13, s8 +; MOVREL-NEXT: v_mov_b32_e32 v12, s7 +; MOVREL-NEXT: v_mov_b32_e32 v11, s6 +; MOVREL-NEXT: v_mov_b32_e32 v10, s5 +; MOVREL-NEXT: v_mov_b32_e32 v9, s4 +; MOVREL-NEXT: v_mov_b32_e32 v8, s3 +; MOVREL-NEXT: v_mov_b32_e32 v7, s2 +; MOVREL-NEXT: v_mov_b32_e32 v6, s1 +; MOVREL-NEXT: v_mov_b32_e32 v5, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 -; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v3 +; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v7 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v8 @@ -4448,7 +4448,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry 
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4457,7 +4457,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -4514,7 +4514,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4522,7 +4522,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir index 51a116a944ad6..47b4a5c400d37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -94,8 +94,7 @@ body: | ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_v ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967232, implicit $exec - ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: %2:vgpr_32, dead %3:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec ; GFX6: S_ENDPGM 0, implicit %2 ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v ; GFX9: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.reloc.constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.reloc.constant.mir new file mode 100644 index 0000000000000..76582ad083f69 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.reloc.constant.mir @@ -0,0 +1,50 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- | + + define void @reloc_constant_sgpr32() { ret void } + define void @reloc_constant_vgpr32() { ret void } + + declare i32 @llvm.amdgcn.reloc.constant(metadata) + + !0 = !{!"arst"} + +... 
+ +--- +name: reloc_constant_sgpr32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: reloc_constant_sgpr32 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @arst + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: S_ENDPGM 0, implicit $sgpr0 + %0:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 + $sgpr0 = COPY %0 + S_ENDPGM 0, implicit $sgpr0 + +... + +--- +name: reloc_constant_vgpr32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: reloc_constant_vgpr32 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @arst, implicit $exec + ; GCN: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 + $vgpr0 = COPY %0 + S_ENDPGM 0, implicit $vgpr0 + +... + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir index ecfb9b618f5ed..81437acbbbc53 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir @@ -20,7 +20,7 @@ body: | ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -30,7 +30,7 @@ body: | ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -386,7 +386,7 @@ body: | ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_AND_B64_]] ; WAVE32-LABEL: name: and_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -397,7 +397,7 @@ body: | ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, 
[[V_AND_B32_e32_1]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -425,24 +425,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -471,24 +471,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir index 58d01774f745b..dcad0a85e8e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir @@ -22,22 +22,88 @@ body: | ... --- +name: anyext_sgpr_s32_to_sgpr_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: anyext_sgpr_s32_to_sgpr_s64 + ; GCN: liveins: $sgpr0 + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s64) = G_ANYEXT %0 + S_ENDPGM 0, implicit %1 + +... -name: anyext_sgpr_s16_to_sgpr_s64 +--- +name: anyext_sgpr_s16_to_sgpr_s64 legalized: true regBankSelected: true -body: | +tracksRegLiveness: true +body: | bb.0: liveins: $sgpr0 ; GCN-LABEL: name: anyext_sgpr_s16_to_sgpr_s64 + ; GCN: liveins: $sgpr0 ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[COPY]] - ; GCN: $sgpr0_sgpr1 = COPY [[COPY1]] + ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:sgpr(s64) = G_ANYEXT %1 - $sgpr0_sgpr1 = COPY %2 + S_ENDPGM 0, implicit %2 + +... + +--- +name: anyext_vgpr_s32_to_vgpr_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: anyext_vgpr_s32_to_vgpr_s64 + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s64) = G_ANYEXT %0 + S_ENDPGM 0, implicit %1 + +... 
+ +--- +name: anyext_vgpr_s16_to_vgpr_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: anyext_vgpr_s16_to_vgpr_s64 + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s64) = G_ANYEXT %1 + S_ENDPGM 0, implicit %2 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir new file mode 100644 index 0000000000000..834380d617912 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir @@ -0,0 +1,173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + + +--- +name: atomic_cmpxchg_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s32_region + ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %3 + +... 
+ +--- +name: atomic_cmpxchg_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_CONSTANT i32 4 + %4:vgpr(p2) = G_PTR_ADD %0, %3 + %5:vgpr(s32) = G_ATOMIC_CMPXCHG %4, %1, %2 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %5 + +... 
+ +--- +name: atomic_cmpxchg_s64_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s64_region + ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s64_region + ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s64) = COPY $vgpr1_vgpr2 + %2:vgpr(s64) = COPY $vgpr3_vgpr4 + %3:vgpr(s64) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 8, addrspace 2) + $vgpr0_vgpr1 = COPY %3 + +... 
+ +--- +name: atomic_cmpxchg_s64_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; GFX6-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX7-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX9-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX9: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 2) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s64) = COPY $vgpr1_vgpr2 + %2:vgpr(s64) = COPY $vgpr3_vgpr4 + %3:vgpr(s32) = G_CONSTANT i32 4 + %4:vgpr(p2) = G_PTR_ADD %0, %3 + %5:vgpr(s64) = G_ATOMIC_CMPXCHG %0, %1, %2 :: (load store seq_cst 8, addrspace 2) + $vgpr0_vgpr1 = COPY %5 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir new file mode 100644 index 0000000000000..26619a1753eb5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-fadd-region.mir @@ -0,0 +1,116 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + +# GFX6/7 selection should fail. 
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s + +--- +name: atomicrmw_fadd_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %2 + +... + +--- +name: atomicrmw_fadd_s32_region_noret +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region_noret + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_FADD [[COPY]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_FADD %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + +... 
+ +--- +name: atomicrmw_fadd_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: $m0 = S_MOV_B32 -1 + ; GFX8: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX8: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX9-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_ADD_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_ADD_RTN_F32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_ADD_RTN_F32_]] + ; GFX6-LABEL: name: atomicrmw_fadd_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p2) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p2) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[ATOMICRMW_FADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_FADD [[PTR_ADD]](p2), [[COPY1]] :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[ATOMICRMW_FADD]](s32) + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_CONSTANT i32 4 + %3:vgpr(p2) = G_PTR_ADD %0, %2 + %4:vgpr(s32) = G_ATOMICRMW_FADD %3(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %4 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir new file mode 100644 index 0000000000000..0552551f35105 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-xchg-region.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + + +--- +name: atomicrmw_xchg_s32_region +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX7-LABEL: name: atomicrmw_xchg_s32_region + ; GFX7: liveins: $vgpr0, $vgpr1 + ; 
GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX9-LABEL: name: atomicrmw_xchg_s32_region + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_ATOMICRMW_XCHG %0(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %2 + +... + +--- +name: atomicrmw_xchg_s32_region_gep4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + ; GFX6: %3:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 %3, [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX6: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX7-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX7: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + ; GFX9-LABEL: name: atomicrmw_xchg_s32_region_gep4 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[DS_WRXCHG_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_WRXCHG_RTN_B32 [[COPY]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 2) + ; GFX9: $vgpr0 = COPY [[DS_WRXCHG_RTN_B32_]] + %0:vgpr(p2) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = G_CONSTANT i32 4 + %3:vgpr(p2) = G_PTR_ADD %0, %2 + %4:vgpr(s32) = G_ATOMICRMW_XCHG %3(p2), %1 :: (load store seq_cst 4, addrspace 2) + $vgpr0 = COPY %4 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir index 496d496a42c58..6adb3549778c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir @@ -20,7 +20,7 @@ body: | ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GCN: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY2]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: bb.1: @@ -46,7 +46,7 @@ body: | ; GCN-LABEL: name: brcond_scc_impdef ; GCN: bb.0: ; GCN: successors: %bb.1(0x80000000) - ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: $scc = COPY [[DEF]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: bb.1: @@ -73,7 +73,7 @@ body: | ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GCN: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GCN: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY2]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir index c380d3c77defc..056ea79a98988 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir @@ -430,3 +430,273 @@ body: | %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4 S_ENDPGM 0, implicit %5 ... + +--- +name: test_build_vector_trunc_s_v2s16_constant_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_CONSTANT i32 123 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_impdef_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: S_ENDPGM 0, implicit [[DEF]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc + ; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 -16 + %1:sgpr(s16) = G_CONSTANT i16 -3 + %2:sgpr(s32) = G_SEXT %0 + %3:sgpr(s32) = G_SEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 456 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_0 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0 + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_0_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index db37495f052d8..c8762c0d578eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -1,55 +1,147 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN --- - -name: constant +name: constant_v_s32 legalized: true regBankSelected: true tracksRegLiveness: true - body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GCN-LABEL: name: constant - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p1) = COPY $vgpr2_vgpr3 - - ; GCN: %{{[0-9]+}}:sreg_32 = S_MOV_B32 1 - %2:sreg_32(s32) = G_CONSTANT i32 1 - - ; GCN: [[LO0:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[HI0:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GCN: %{{[0-9]+}}:sreg_64 = REG_SEQUENCE [[LO0]], %subreg.sub0, [[HI0]], %subreg.sub1 - %3:sgpr(s64) = G_CONSTANT i64 4294967296 - - ; GCN: %{{[0-9]+}}:sreg_32 = S_MOV_B32 1065353216 - %4:sgpr(s32) = G_FCONSTANT float 1.0 + ; GCN-LABEL: name: constant_v_s32 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + %0:vgpr(s32) = G_CONSTANT i32 0 + %1:vgpr(s32) = G_CONSTANT i32 1 + %2:vgpr(s32) = G_CONSTANT i32 -1 + %3:vgpr(s32) = G_CONSTANT i32 -54 + %4:vgpr(s32) = G_CONSTANT i32 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... - ; GCN: %5:sreg_64 = S_MOV_B64 4607182418800017408 - %5:sgpr(s64) = G_FCONSTANT double 1.0 +--- +name: constant_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true - ; GCN: [[LO1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[HI1:%[0-9]+]]:sreg_32 = S_MOV_B32 1076101120 - ; GCN: %{{[0-9]+}}:sreg_64 = REG_SEQUENCE [[LO1]], %subreg.sub0, [[HI1]], %subreg.sub1 - %6:sgpr(s64) = G_FCONSTANT double 10.0 +body: | + bb.0: + ; GCN-LABEL: name: constant_s_s32 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 + ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = G_CONSTANT i32 1 + %2:sgpr(s32) = G_CONSTANT i32 -1 + %3:sgpr(s32) = G_CONSTANT i32 -54 + %4:sgpr(s32) = G_CONSTANT i32 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... 
- ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1 - %7:vgpr(s32) = G_CONSTANT i32 1 +# FIXME +# --- +# name: constant_v_s16 +# legalized: true +# regBankSelected: true +# tracksRegLiveness: true - ; GCN: [[LO2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 - ; GCN: [[HI2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1 - ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO2]], %subreg.sub0, [[HI2]], %subreg.sub1 - %8:vgpr(s64) = G_CONSTANT i64 4294967296 +# body: | +# bb.0: +# %0:vgpry(s16) = G_CONSTANT i16 0 +# %1:vgpr(s16) = G_CONSTANT i16 1 +# %2:vgpr(s16) = G_CONSTANT i16 -1 +# %3:vgpr(s16) = G_CONSTANT i16 -54 +# %4:vgpr(s16) = G_CONSTANT i16 27 +# S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +# ... - ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1065353216 - %9:vgpr(s32) = G_FCONSTANT float 1.0 +--- +name: constant_v_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true - ; GCN: [[LO3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 - ; GCN: [[HI3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248 - ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO3]], %subreg.sub0, [[HI3]], %subreg.sub1 - %10:vgpr(s64) = G_FCONSTANT double 1.0 +body: | + bb.0: + ; GCN-LABEL: name: constant_v_s64 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GCN: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; GCN: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; GCN: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GCN: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; GCN: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec + ; GCN: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit 
[[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + %0:vgpr(s64) = G_CONSTANT i64 0 + %1:vgpr(s64) = G_CONSTANT i64 1 + %2:vgpr(s64) = G_CONSTANT i64 -1 + %3:vgpr(s64) = G_CONSTANT i64 -54 + %4:vgpr(s64) = G_CONSTANT i64 27 + %5:vgpr(s64) = G_CONSTANT i64 4294967295 + %6:vgpr(s64) = G_CONSTANT i64 4294967296 + %7:vgpr(s64) = G_CONSTANT i64 18446744004990098135 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 +... - S_ENDPGM 0, implicit %2, implicit %4, implicit %5, implicit %6, implicit %8, implicit %3, implicit %5, implicit %7, implicit %9, implicit %10 +--- +name: constant_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_s64 + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GCN: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GCN: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 + ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + %0:sgpr(s64) = G_CONSTANT i64 0 + %1:sgpr(s64) = G_CONSTANT i64 1 + %2:sgpr(s64) = G_CONSTANT i64 -1 + %3:sgpr(s64) = G_CONSTANT i64 -54 + %4:sgpr(s64) = G_CONSTANT i64 27 + %5:sgpr(s64) = G_CONSTANT i64 4294967295 + %6:sgpr(s64) = G_CONSTANT i64 4294967296 + %7:sgpr(s64) = G_CONSTANT i64 18446744004990098135 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir index da730b0c9fa9a..9edad11a23418 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract-vector-elt.mir @@ -315,7 +315,7 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] ; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]] @@ -323,7 +323,7 @@ body: | ; GPRIDX-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: $m0 = COPY [[S_ADD_I32_]] ; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]] @@ -468,7 +468,7 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:sgpr_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] ; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]] @@ -476,7 +476,7 @@ body: | ; GPRIDX-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: $m0 = COPY [[S_ADD_I32_]] ; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]] @@ -502,13 +502,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v2s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GPRIDX: 
S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -530,13 +530,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v3s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 @@ -558,13 +558,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v4s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 @@ -586,13 +586,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -614,13 +614,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_512 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v16s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_512 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -642,13 +642,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_1024 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr40 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v32s32 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_1024 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr40 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF 
implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<32 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 @@ -670,13 +670,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub1, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub1, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_1 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub1, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub1, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -699,18 +699,18 @@ body: | ; MOVREL-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_m1 ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_m1 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_I32_]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -734,13 +734,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: $m0 = COPY [[COPY1]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub7, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub7, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_7 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub7, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub7, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -766,7 +766,7 @@ body: | ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8 ; MOVREL: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; MOVREL: $m0 = COPY [[S_ADD_I32_]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_8 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -774,7 +774,7 @@ body: | ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8 ; GPRIDX: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_I32_]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -825,13 +825,13 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; MOVREL: $m0 = COPY [[S_MOV_B32_]] - ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] + ; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]] ; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]] ; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v4s32_const_idx ; GPRIDX: [[COPY:%[0-9]+]]:vreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GPRIDX: S_SET_GPR_IDX_ON [[S_MOV_B32_]], 1, implicit-def $m0, implicit-def $mode, implicit $m0, implicit $mode - ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef 
[[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 + ; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0 ; GPRIDX: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] %0:vgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir index df16e9c1f0917..d6d2a74cbcc21 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir @@ -199,6 +199,45 @@ body: | ... +--- +name: extract_sgpr_s32_from_v3s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2 + ; CHECK-LABEL: name: extract_sgpr_s32_from_v3s32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_96 = COPY $sgpr0_sgpr1_sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub2 + ; CHECK: S_ENDPGM 0, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]] + %0:sgpr(<3 x s32>) = COPY $sgpr0_sgpr1_sgpr2 + %1:sgpr(s32) = G_EXTRACT %0, 0 + %2:sgpr(s32) = G_EXTRACT %0, 32 + %3:sgpr(s32) = G_EXTRACT %0, 64 + S_ENDPGM 0, implicit %0, implicit %2, implicit %3 + +... + +--- +name: extract_sgpr_v2s32_from_v3s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2 + ; CHECK-LABEL: name: extract_sgpr_v2s32_from_v3s32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub0_sub1 = COPY $sgpr0_sgpr1_sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[COPY]].sub0_sub1 + ; CHECK: S_ENDPGM 0, implicit [[COPY1]] + %0:sgpr(<3 x s32>) = COPY $sgpr0_sgpr1_sgpr2 + %1:sgpr(<2 x s32>) = G_EXTRACT %0, 0 + S_ENDPGM 0, implicit %1 + +... + --- name: extract_sgpr_v3s32_from_v4s32 legalized: true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir new file mode 100644 index 0000000000000..9afa4b08c0ecb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir @@ -0,0 +1,159 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: fconstant_v_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s32 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]] + %0:vgpr(s32) = G_FCONSTANT float 1.0 + %1:vgpr(s32) = G_FCONSTANT float 8.0 + %2:vgpr(s32) = G_FCONSTANT float 1.0 + %3:vgpr(s32) = G_FCONSTANT float 8.0 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 +... 
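+# An f32 G_FCONSTANT is just a 32-bit move of the value's IEEE-754 bit
+# pattern: 1.0 is 0x3F800000 (1065353216) and 8.0 is 0x41000000
+# (1090519040), matching the immediates checked above.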
+ +--- +name: fconstant_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s32 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1065353216 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1090519040 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3212836864 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 3238002688 + ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] + ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] + %0:sgpr(s32) = G_FCONSTANT float 1.0 + %1:sgpr(s32) = G_FCONSTANT float 8.0 + %2:sgpr(s32) = G_FCONSTANT float -1.0 + %3:sgpr(s32) = G_FCONSTANT float -8.0 + $sgpr0 = COPY %0 + $sgpr1 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + +... + +--- +name: fconstant_v_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s64 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075838976, implicit $exec + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1073741824, implicit $exec + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; GCN: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1076101120, implicit $exec + ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; GCN: $vgpr0_vgpr1 = COPY [[REG_SEQUENCE]] + ; GCN: $vgpr2_vgpr3 = COPY [[REG_SEQUENCE1]] + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + %0:vgpr(s64) = G_FCONSTANT double 1.0 + %1:vgpr(s64) = G_FCONSTANT double 8.0 + %2:vgpr(s64) = G_FCONSTANT double -2.0 + %3:vgpr(s64) = G_FCONSTANT double 10.0 + $vgpr0_vgpr1 = COPY %0 + $vgpr2_vgpr3 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + +... 
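+# With no 64-bit VALU move available here, a vgpr f64 constant is built
+# from two V_MOV_B32_e32 of the low and high halves of the bit pattern,
+# joined by a REG_SEQUENCE; double 1.0 is 0x3FF0000000000000, i.e. lo 0
+# and hi 0x3FF00000 (1072693248). The sgpr case below can instead use a
+# single S_MOV_B64 for encodable immediates such as 1.0 and -2.0.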
+ +--- +name: fconstant_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s64 + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1075838976 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904 + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1071382528 + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; GCN: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]] + ; GCN: $sgpr2_sgpr3 = COPY [[REG_SEQUENCE]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_1]], implicit [[REG_SEQUENCE1]] + %0:sgpr(s64) = G_FCONSTANT double 1.0 + %1:sgpr(s64) = G_FCONSTANT double 8.0 + %2:sgpr(s64) = G_FCONSTANT double -2.0 + %3:sgpr(s64) = G_FCONSTANT double -10.0 + $sgpr0_sgpr1 = COPY %0 + $sgpr2_sgpr3 = COPY %1 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 +... + +--- +name: fconstant_v_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_v_s16 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 18432, implicit $exec + ; GCN: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; GCN: $vgpr1 = COPY [[V_MOV_B32_e32_1]] + %0:vgpr(s16) = G_FCONSTANT half 1.0 + %1:vgpr(s16) = G_FCONSTANT half 8.0 + %2:vgpr(s32) = G_ANYEXT %0 + %3:vgpr(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 + $vgpr1 = COPY %3 + +... + +--- +name: fconstant_s_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GCN-LABEL: name: fconstant_s_s16 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 15360 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 18432 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GCN: $sgpr0 = COPY [[COPY]] + ; GCN: $sgpr1 = COPY [[COPY1]] + %0:sgpr(s16) = G_FCONSTANT half 1.0 + %1:sgpr(s16) = G_FCONSTANT half 8.0 + %2:vgpr(s32) = G_ANYEXT %0 + %3:vgpr(s32) = G_ANYEXT %1 + $sgpr0 = COPY %2 + $sgpr1 = COPY %3 + +... 
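+# f16 constants are materialized as the 16-bit pattern in a 32-bit
+# register: half 1.0 is 0x3C00 (15360) and half 8.0 is 0x4800 (18432),
+# so the G_ANYEXT folds away in the checks above.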
+ diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir index 5e58e8b633ec4..150b341561f97 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.xfail.mir @@ -1,9 +1,13 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s -# FIXME: This should not be legal and this test should be deleted -# ERR: remark: :0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_2s16_to_v4s16_offset0) +# ERR: remark: :0:0: cannot select: %3:sgpr(<4 x s16>) = G_INSERT %0:sgpr, %2:sgpr(s16), 0 (in function: insert_sgpr_s16_to_v4s16_offset0) +# ERR-NEXT: remark: :0:0: cannot select: %2:sgpr(<16 x s32>) = G_INSERT %0:sgpr, %1:sgpr(<8 x s32>), 0 (in function: insert_sgpr_v8s32_to_v16s32_offset0) +# ERR-NOT: remark + +# FIXME: This 16-bit insert source should not be legal and this test +# should be deleted --- -name: insert_sgpr_2s16_to_v4s16_offset0 +name: insert_sgpr_s16_to_v4s16_offset0 legalized: true regBankSelected: true @@ -17,3 +21,19 @@ body: | S_ENDPGM 0, implicit %3 ... + +# getSubRegFromChannel currently does not handle cases > 128-bits +--- +name: insert_sgpr_v8s32_to_v16s32_offset0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 + %0:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:sgpr(<8 x s32>) = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 + %2:sgpr(<16 x s32>) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 + +...
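+# The <8 x s32> insert above is expected to fail selection; the
+# ERR-NEXT line at the top of the file checks for exactly that remark.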
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir index a80ad208b5898..45d74ad38e532 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -681,7 +681,7 @@ body: | ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX7: $m0 = S_MOV_B32 -1 ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) @@ -689,14 +689,14 @@ body: | ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: $m0 = S_MOV_B32 -1 ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index 162dd01de66d1..9f6d10722143d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -311,14 +311,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 
0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -347,14 +347,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -453,14 +453,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -489,14 +489,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec + ; 
GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -597,14 +597,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -633,14 +633,14 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] @@ -860,11 +860,11 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_neg1 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_neg1 - ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir index 40b7b69f83b7f..7f1f52d2c522a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir @@ -20,7 +20,7 @@ body: | ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_OR_B64_]] ; WAVE32-LABEL: name: or_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -30,7 +30,7 @@ body: | ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -386,7 +386,7 @@ body: | ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_OR_B64_]] ; WAVE32-LABEL: name: or_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -397,7 +397,7 @@ body: | ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_OR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -425,24 +425,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -471,24 +471,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: 
[[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir index b4ef0caebfc13..4e7c81f5c79ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir @@ -17,7 +17,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -66,7 +66,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -116,7 +116,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -165,7 +165,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -215,7 +215,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -263,7 +263,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -314,7 +314,7 @@ body: | ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[S_MOV_B32_]], implicit $exec ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -363,7 +363,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def 
$scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 @@ -412,7 +412,7 @@ body: | ; GCN: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: S_CMP_EQ_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $scc + ; GCN: [[COPY3:%[0-9]+]]:sreg_32 = COPY $scc ; GCN: $scc = COPY [[COPY3]] ; GCN: S_CBRANCH_SCC1 %bb.1, implicit $scc ; GCN: S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir index bbb885c705ed6..a7f875fcdd428 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir @@ -33,7 +33,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_0xf0f0f0f0 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4042322160 + ; CHECK: %const:sreg_32 = S_MOV_B32 -252645136 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -54,7 +54,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_0xffffffff ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4294967295 + ; CHECK: %const:sreg_32 = S_MOV_B32 -1 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -96,7 +96,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi1 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 2147483648 + ; CHECK: %const:sreg_32 = S_MOV_B32 -2147483648 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -117,7 +117,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearhi2 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 3221225472 + ; CHECK: %const:sreg_32 = S_MOV_B32 -1073741824 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -138,7 +138,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo1 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4294967294 + ; CHECK: %const:sreg_32 = S_MOV_B32 -2 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -159,7 +159,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo2 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4294967292 + ; CHECK: %const:sreg_32 = S_MOV_B32 -4 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -180,7 +180,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo3 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4294967288 + ; CHECK: %const:sreg_32 = S_MOV_B32 -8 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = 
COPY $sgpr0 @@ -201,7 +201,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo4 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 4294967280 + ; CHECK: %const:sreg_32 = S_MOV_B32 -16 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -222,7 +222,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_s32_sgpr_sgpr_clearlo29 ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: %const:sreg_32 = S_MOV_B32 3758096384 + ; CHECK: %const:sreg_32 = S_MOV_B32 -536870912 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], %const, implicit-def $scc ; CHECK: S_ENDPGM 0, implicit [[S_AND_B32_]] %0:sgpr(p3) = COPY $sgpr0 @@ -560,7 +560,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_0xf0f0f0f0 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4042322160, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -252645136, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 @@ -581,7 +581,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo1 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967294, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 @@ -602,7 +602,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo2 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967292, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 @@ -623,7 +623,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo3 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967288, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 @@ -644,7 +644,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo4 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 4294967280, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 @@ -665,7 +665,7 @@ body: | ; CHECK-LABEL: name: ptrmask_p3_vgpr_vgpr_clearlo29 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 3758096384, implicit $exec + ; CHECK: %const:vgpr_32 = V_MOV_B32_e32 -536870912, implicit $exec ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], %const, implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] %0:vgpr(p3) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir index 1c03557cc4953..f923a4c9f02b8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir @@ -20,7 +20,7 @@ body: | ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE64: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_XOR_B64_]] ; WAVE32-LABEL: name: xor_s1_vcc_vcc_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -30,7 +30,7 @@ body: | ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; WAVE32: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec - ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]] + ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_XOR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -387,7 +387,7 @@ body: | ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: S_ENDPGM 0, implicit [[S_XOR_B64_]] ; WAVE32-LABEL: name: xor_s1_vcc_copy_to_vcc ; WAVE32: liveins: $vgpr0, $vgpr1 @@ -398,7 +398,7 @@ body: | ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec - ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: S_ENDPGM 0, implicit [[S_XOR_B32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -426,24 +426,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], 
[[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 @@ -472,24 +472,24 @@ body: | ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE64: liveins: $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 ; WAVE32: liveins: $vgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec - ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]] + ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index 07b8dced57132..1315f1bc275eb 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -326,4 +326,13 @@ entry: ret i32 %asm1 } +define amdgpu_kernel void @asm_constraint_n_n() { + ; CHECK-LABEL: name: asm_constraint_n_n + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: INLINEASM &"s_trap ${0:n}", 1 /* sideeffect attdialect */, 13 /* imm */, 10 + ; CHECK: S_ENDPGM 0 + tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1 + ret void +} + !0 = !{i32 70} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll new file mode 100644 index 0000000000000..6a3b3b8128661 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -stop-after=irtranslator -o - %s | FileCheck %s +; Make sure intrinsics with metadata arguments are translated + +define i32 @reloc_constant() { + ; CHECK-LABEL: name: reloc_constant + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 + ; CHECK: $vgpr0 = COPY [[INT]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] + ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0 + %val = call i32 @llvm.amdgcn.reloc.constant(metadata !0) + ret i32 %val +} + +declare i32 @llvm.amdgcn.reloc.constant(metadata) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } + +!0 = !{!"arst"} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index 3d26cefc4f55d..ef7e4e2f15a2e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -361,15 +361,18 @@ body: | ; CHECK-LABEL: name: test_and_v3s16 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CHECK: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[INSERT]], [[INSERT1]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[AND]](<4 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT 
[[DEF2]], [[EXTRACT2]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[AND]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF @@ -403,27 +406,32 @@ body: | ; CHECK-LABEL: name: test_and_v5s16 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF]](<6 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF1]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>), [[UV2:%[0-9]+]]:_(<5 x s16>), [[UV3:%[0-9]+]]:_(<5 x s16>), [[UV4:%[0-9]+]]:_(<5 x s16>), [[UV5:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<5 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) - ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF2]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV6:%[0-9]+]]:_(<5 x s16>), [[UV7:%[0-9]+]]:_(<5 x s16>), [[UV8:%[0-9]+]]:_(<5 x s16>), [[UV9:%[0-9]+]]:_(<5 x s16>), [[UV10:%[0-9]+]]:_(<5 x s16>), [[UV11:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<30 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV]](<5 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV6]](<5 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) + ; CHECK: [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) ; CHECK: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV14]](<3 x s16>), 0 ; CHECK: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[INSERT2]], [[INSERT3]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[AND]](<4 x s16>), 0 - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV1]](<3 x s16>), 0 - ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV3]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[AND]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), 
[[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV13]](<3 x s16>), 0 + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV15]](<3 x s16>), 0 ; CHECK: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[INSERT4]], [[INSERT5]] - ; CHECK: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[AND1]](<4 x s16>), 0 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT2]](<3 x s16>), [[EXTRACT3]](<3 x s16>) - ; CHECK: [[EXTRACT4:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[AND1]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV20:%[0-9]+]]:_(<3 x s16>), [[UV21:%[0-9]+]]:_(<3 x s16>), [[UV22:%[0-9]+]]:_(<3 x s16>), [[UV23:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; CHECK: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV16]](<3 x s16>), [[UV20]](<3 x s16>) + ; CHECK: [[CONCAT_VECTORS5:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS4]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV24:%[0-9]+]]:_(<5 x s16>), [[UV25:%[0-9]+]]:_(<5 x s16>), [[UV26:%[0-9]+]]:_(<5 x s16>), [[UV27:%[0-9]+]]:_(<5 x s16>), [[UV28:%[0-9]+]]:_(<5 x s16>), [[UV29:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS5]](<30 x s16>) ; CHECK: [[DEF4:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[EXTRACT4]](<5 x s16>), 0 + ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[UV24]](<5 x s16>), 0 ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT6]](<8 x s16>) %0:_(<5 x s16>) = G_IMPLICIT_DEF %1:_(<5 x s16>) = G_IMPLICIT_DEF @@ -463,8 +471,9 @@ body: | ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] ; CHECK: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[AND3]](s32) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[BUILD_VECTOR]](<4 x s8>), 0 - ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s8>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<4 x s8>), [[DEF2]](<4 x s8>), [[DEF2]](<4 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s8>), [[UV9:%[0-9]+]]:_(<3 x s8>), [[UV10:%[0-9]+]]:_(<3 x s8>), [[UV11:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV8]](<3 x s8>) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT8]](<3 x s32>) %0:_(<3 x s8>) = G_IMPLICIT_DEF %1:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir index c4d511ddda3ca..a39e97cf3309e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir @@ -863,14 +863,17 @@ body: | ; GFX9: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[EXTRACT2]], [[EXTRACT4]](<2 x s16>) ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[EXTRACT3]], [[EXTRACT5]](s16) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT6:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT6]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = 
G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[ASHR]](<2 x s16>), 0 - ; GFX9: [[EXTRACT7:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT5]](<4 x s16>), 0 - ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT7]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[ASHR1]](s16), 32 - ; GFX9: [[EXTRACT8:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT7]](<4 x s16>), 0 - ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT8]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT8]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir index 6490e9a603b79..b548ff5503435 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: extract_vector_elt_0_v2i32 @@ -228,8 +228,10 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_0_v2i16_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; CHECK: $vgpr0 = COPY [[COPY]](s32) %0:_(<2 x s16>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 @@ -290,25 +292,155 @@ name: extract_vector_elt_v2s8_varidx_i32 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2 + liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: extract_vector_elt_v2s8_varidx_i32 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[COPY]](<2 x s32>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8 - ; CHECK: 
[[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[COPY1]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) - ; CHECK: $vgpr0 = COPY [[COPY3]](s32) - %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 - %1:_(s32) = COPY $vgpr2 - %2:_(<2 x s8>) = G_TRUNC %0 - %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 - %4:_(s32) = G_ANYEXT %3 - $vgpr0 = COPY %4 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 +... + +--- +name: extract_vector_elt_v2s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 0 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s32) = G_CONSTANT i32 0 + %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4 + %6:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 +... 
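Note on the new <2 x s8> extract tests above and below: instead of operating on the illegal <2 x s8> type directly, the legalized sequence unpacks both bytes from the 16-bit scalar, sign-extends each into its own 32-bit lane (the paired G_SEXT_INREG ..., 8), and then indexes the resulting <2 x s32>. A minimal C sketch of the arithmetic these checks encode (helper name hypothetical, two's-complement narrowing assumed; the & 1 only keeps the sketch total, since an out-of-range G_EXTRACT_VECTOR_ELT index is undefined anyway):

#include <stdint.h>

int32_t extract_lane_v2s8(uint16_t bits, uint32_t idx) {
  int32_t lanes[2];
  lanes[0] = (int8_t)(bits & 0xffu); /* lane 0: low byte, then G_SEXT_INREG ..., 8 */
  lanes[1] = (int8_t)(bits >> 8);    /* lane 1: G_LSHR by 8, then sign-extend */
  return lanes[idx & 1u];            /* G_EXTRACT_VECTOR_ELT on the <2 x s32> */
}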
+ +--- +name: extract_vector_elt_v2s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 32 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s32) = G_CONSTANT i32 1 + %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4 + %6:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 +... + +--- +name: extract_vector_elt_v4s4_varidx_i32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v4s4_varidx_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 4 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 4 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY8]], 4 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY9]], 4 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT 
[[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY10]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<4 x s4>) = G_BITCAST %2 + %4:_(s4) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 ... --- @@ -343,24 +475,559 @@ name: extract_vector_elt_v4s8_varidx_i32 body: | bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: extract_vector_elt_v4s8_varidx_i32 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8 - ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8 - ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV2]], 8 - ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 8 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) - ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) - ; CHECK: $vgpr0 = COPY [[COPY3]](s32) - %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - %1:_(s32) = COPY $vgpr4 - %2:_(<4 x s8>) = G_TRUNC %0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[SHL3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<4 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
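The <4 x s8> variable-index case above now round-trips through a plain 32-bit scalar: the four bytes are re-packed with the AND/SHL/OR chain, the index is masked to two bits and scaled to a bit offset ((idx & 3) << 3), and a single G_LSHR moves the selected byte down to bit 0. A C sketch of the same computation (function name hypothetical):

#include <stdint.h>

uint8_t extract_lane_v4s8(uint32_t packed, uint32_t idx) {
  uint32_t shift = (idx & 3u) << 3;  /* G_AND with 3, then G_SHL by 3 */
  return (uint8_t)(packed >> shift); /* G_LSHR; the low byte is the result */
}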
+ +--- +name: extract_vector_elt_v4s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C4]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C3]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 1 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_2_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_2_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_3_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_3_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C2]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 3 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
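For the four constant-index <4 x s8> variants above, the dynamic (idx & 3) << 3 folds away and the repacked value is shifted by a literal 0, 8, 16, or 24, matching the G_LSHR constants in each test. Equivalent C sketch (name hypothetical):

#include <stdint.h>

uint8_t extract_lane_v4s8_k(uint32_t packed, unsigned k) {
  const uint32_t shift[4] = {0u, 8u, 16u, 24u}; /* folded 8 * k bit offsets */
  return (uint8_t)(packed >> shift[k & 3u]);
}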
+ + + +--- +name: extract_vector_elt_v8s8_varidx_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_varidx_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]] + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C5]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C6]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL6]](s32) + ; CHECK: 
[[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... + + +--- +name: extract_vector_elt_v8s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C2]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
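The <8 x s8> variable-index lowering above is the two-level version of the same idea: bitcast the s64 to <2 x s32>, select the 32-bit half with idx >> 2 (a real G_EXTRACT_VECTOR_ELT on the <2 x s32>), then shift by the byte offset (idx & 3) << 3. Sketch in C (name hypothetical; the & 1 on the half index only keeps the sketch total):

#include <stdint.h>

uint8_t extract_lane_v8s8(uint64_t packed, uint32_t idx) {
  uint32_t half = (uint32_t)(packed >> (((idx >> 2) & 1u) * 32u)); /* select the s32 half */
  return (uint8_t)(half >> ((idx & 3u) << 3));                     /* shift by the byte offset */
}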
+ +--- +name: extract_vector_elt_v8s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_3_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_3_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 3 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_4_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_4_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C5]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 4 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_5_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_5_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 5 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
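In the constant-index <8 x s8> tests above (and the index-7 case that follows), both levels fold at legalization time: indices 0-3 repack only the low half (UV), indices 4-7 only the high half (UV1), and the remaining G_LSHR uses the literal 8 * (k & 3). C equivalent (name hypothetical):

#include <stdint.h>

uint8_t extract_lane_v8s8_k(uint64_t packed, unsigned k) {
  uint32_t half = (uint32_t)(k < 4u ? packed : (packed >> 32)); /* half chosen statically */
  return (uint8_t)(half >> ((k & 3u) * 8u));                    /* folded bit offset */
}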
+
+---
+name: extract_vector_elt_v8s8_constidx_7_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_7_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY4]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 7
+    %2:_(<8 x s8>) = G_BITCAST %0
     %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1
     %4:_(s32) = G_ANYEXT %3
     $vgpr0 = COPY %4
@@ -376,9 +1043,14 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s16>), [[COPY1]](s32)
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
-    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[SHL]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -395,8 +1067,10 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx0_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
     ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = G_CONSTANT i32 0
@@ -435,8 +1109,11 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx2_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
-    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK: $vgpr0 = COPY [[DEF]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(s32) = G_CONSTANT i32 2
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -579,9 +1256,16 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v4s16_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[COPY1]](s32)
-    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16)
-    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<2 x s32>), [[LSHR]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(s32) = COPY $vgpr2
     %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -599,8 +1283,18 @@ body: |
     ; CHECK-LABEL: name: extract_vector_elt_v2s128_varidx_i32
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr8
-    ; CHECK: [[EVEC:%[0-9]+]]:_(s128) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s128>), [[COPY1]](s32)
-    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[EVEC]](s128)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<2 x s128>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY1]], [[C]]
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C1]]
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C2]]
+    ; CHECK: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD1]](s32)
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[EVEC]](s64), [[EVEC1]](s64)
+    ; CHECK: [[BITCAST1:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST1]](s128)
     %0:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     %1:_(s32) = COPY $vgpr8
     %2:_(s128) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -694,3 +1388,787 @@ body: |
     %3:_(s32) = G_EXTRACT_VECTOR_ELT %0, %2
     $vgpr0 = COPY %3
 ...
+
+---
+name: extract_vector_elt_7_v64s32
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_7_v64s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>)
+    ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
+    ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>)
+    ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>)
+    ; CHECK: G_STORE [[UV]](s32), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C3]](s32)
+    ; CHECK: G_STORE [[UV1]](s32), [[PTR_ADD3]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s32)
+    ; CHECK: G_STORE [[UV2]](s32), [[PTR_ADD4]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C5]](s32)
+    ; CHECK: G_STORE [[UV3]](s32), [[PTR_ADD5]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s32)
+    ; CHECK: G_STORE [[UV4]](s32), [[PTR_ADD6]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+    ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C7]](s32)
+    ; CHECK: G_STORE [[UV5]](s32), [[PTR_ADD7]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s32)
+    ; CHECK: G_STORE [[UV6]](s32), [[PTR_ADD8]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
+    ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+    ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C9]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p5) = COPY [[PTR_ADD9]](p5)
+    ; CHECK: G_STORE [[UV7]](s32), [[COPY1]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
+    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s32)
+    ; CHECK: G_STORE [[UV8]](s32), [[PTR_ADD10]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
+    ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 36
+    ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C11]](s32)
+    ; CHECK: G_STORE [[UV9]](s32), [[PTR_ADD11]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
+    ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 40
+    ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C12]](s32)
+    ; CHECK: G_STORE [[UV10]](s32), [[PTR_ADD12]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
+    ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 44
+    ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C13]](s32)
+    ; CHECK: G_STORE [[UV11]](s32), [[PTR_ADD13]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
+    ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 48
+    ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C14]](s32)
+    ; CHECK: G_STORE [[UV12]](s32), [[PTR_ADD14]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
+    ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 52
+    ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C15]](s32)
+    ; CHECK: G_STORE [[UV13]](s32), [[PTR_ADD15]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
+    ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 56
+    ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C16]](s32)
+    ; CHECK: G_STORE [[UV14]](s32), [[PTR_ADD16]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
+    ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 60
+    ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C17]](s32)
+    ; CHECK: G_STORE [[UV15]](s32), [[PTR_ADD17]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
+    ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C18]](s32)
+    ; CHECK: G_STORE [[UV16]](s32), [[PTR_ADD18]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
+    ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 68
+    ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C19]](s32)
+    ; CHECK: G_STORE [[UV17]](s32), [[PTR_ADD19]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
+    ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 72
+    ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C20]](s32)
+    ; CHECK: G_STORE [[UV18]](s32), [[PTR_ADD20]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
+    ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 76
+    ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C21]](s32)
+    ; CHECK: G_STORE [[UV19]](s32), [[PTR_ADD21]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
+    ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 80
+    ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C22]](s32)
+    ; CHECK: G_STORE [[UV20]](s32), [[PTR_ADD22]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
+    ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 84
+    ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C23]](s32)
+    ; CHECK: G_STORE [[UV21]](s32), [[PTR_ADD23]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
+    ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 88
+    ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C24]](s32)
+    ; CHECK: G_STORE [[UV22]](s32), [[PTR_ADD24]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
+    ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 92
+    ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C25]](s32)
+    ; CHECK: G_STORE [[UV23]](s32), [[PTR_ADD25]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
+    ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 96
+    ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C26]](s32)
+    ; CHECK: G_STORE [[UV24]](s32), [[PTR_ADD26]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
+    ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+    ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C27]](s32)
+    ; CHECK: G_STORE [[UV25]](s32), [[PTR_ADD27]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
+    ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
+    ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C28]](s32)
+    ; CHECK: G_STORE [[UV26]](s32), [[PTR_ADD28]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
+    ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 108
+    ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C29]](s32)
+    ; CHECK: G_STORE [[UV27]](s32), [[PTR_ADD29]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
+    ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 112
+    ; CHECK: [[PTR_ADD30:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C30]](s32)
+    ; CHECK: G_STORE [[UV28]](s32), [[PTR_ADD30]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
+    ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 116
+    ; CHECK: [[PTR_ADD31:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C31]](s32)
+    ; CHECK: G_STORE [[UV29]](s32), [[PTR_ADD31]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
+    ; CHECK: [[C32:%[0-9]+]]:_(s32) = G_CONSTANT i32 120
+    ; CHECK: [[PTR_ADD32:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C32]](s32)
+    ; CHECK: G_STORE [[UV30]](s32), [[PTR_ADD32]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
+    ; CHECK: [[C33:%[0-9]+]]:_(s32) = G_CONSTANT i32 124
+    ; CHECK: [[PTR_ADD33:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C33]](s32)
+    ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD33]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
+    ; CHECK: [[C34:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+    ; CHECK: [[PTR_ADD34:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C34]](s32)
+    ; CHECK: G_STORE [[UV32]](s32), [[PTR_ADD34]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
+    ; CHECK: [[C35:%[0-9]+]]:_(s32) = G_CONSTANT i32 132
+    ; CHECK: [[PTR_ADD35:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C35]](s32)
+    ; CHECK: G_STORE [[UV33]](s32), [[PTR_ADD35]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
+    ; CHECK: [[C36:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
+    ; CHECK: [[PTR_ADD36:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C36]](s32)
+    ; CHECK: G_STORE [[UV34]](s32), [[PTR_ADD36]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
+    ; CHECK: [[C37:%[0-9]+]]:_(s32) = G_CONSTANT i32 140
+    ; CHECK: [[PTR_ADD37:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C37]](s32)
+    ; CHECK: G_STORE [[UV35]](s32), [[PTR_ADD37]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
+    ; CHECK: [[C38:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
+    ; CHECK: [[PTR_ADD38:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C38]](s32)
+    ; CHECK: G_STORE [[UV36]](s32), [[PTR_ADD38]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
+    ; CHECK: [[C39:%[0-9]+]]:_(s32) = G_CONSTANT i32 148
+    ; CHECK: [[PTR_ADD39:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C39]](s32)
+    ; CHECK: G_STORE [[UV37]](s32), [[PTR_ADD39]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
+    ; CHECK: [[C40:%[0-9]+]]:_(s32) = G_CONSTANT i32 152
+    ; CHECK: [[PTR_ADD40:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C40]](s32)
+    ; CHECK: G_STORE [[UV38]](s32), [[PTR_ADD40]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
+    ; CHECK: [[C41:%[0-9]+]]:_(s32) = G_CONSTANT i32 156
+    ; CHECK: [[PTR_ADD41:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C41]](s32)
+    ; CHECK: G_STORE [[UV39]](s32), [[PTR_ADD41]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
+    ; CHECK: [[C42:%[0-9]+]]:_(s32) = G_CONSTANT i32 160
+    ; CHECK: [[PTR_ADD42:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C42]](s32)
+    ; CHECK: G_STORE [[UV40]](s32), [[PTR_ADD42]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
+    ; CHECK: [[C43:%[0-9]+]]:_(s32) = G_CONSTANT i32 164
+    ; CHECK: [[PTR_ADD43:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C43]](s32)
+    ; CHECK: G_STORE [[UV41]](s32), [[PTR_ADD43]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
+    ; CHECK: [[C44:%[0-9]+]]:_(s32) = G_CONSTANT i32 168
+    ; CHECK: [[PTR_ADD44:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C44]](s32)
+    ; CHECK: G_STORE [[UV42]](s32), [[PTR_ADD44]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
+    ; CHECK: [[C45:%[0-9]+]]:_(s32) = G_CONSTANT i32 172
+    ; CHECK: [[PTR_ADD45:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C45]](s32)
+    ; CHECK: G_STORE [[UV43]](s32), [[PTR_ADD45]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
+    ; CHECK: [[C46:%[0-9]+]]:_(s32) = G_CONSTANT i32 176
+    ; CHECK: [[PTR_ADD46:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C46]](s32)
+    ; CHECK: G_STORE [[UV44]](s32), [[PTR_ADD46]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
+    ; CHECK: [[C47:%[0-9]+]]:_(s32) = G_CONSTANT i32 180
+    ; CHECK: [[PTR_ADD47:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C47]](s32)
+    ; CHECK: G_STORE [[UV45]](s32), [[PTR_ADD47]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
+    ; CHECK: [[C48:%[0-9]+]]:_(s32) = G_CONSTANT i32 184
+    ; CHECK: [[PTR_ADD48:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C48]](s32)
+    ; CHECK: G_STORE [[UV46]](s32), [[PTR_ADD48]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
+    ; CHECK: [[C49:%[0-9]+]]:_(s32) = G_CONSTANT i32 188
+    ; CHECK: [[PTR_ADD49:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C49]](s32)
+    ; CHECK: G_STORE [[UV47]](s32), [[PTR_ADD49]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
+    ; CHECK: [[C50:%[0-9]+]]:_(s32) = G_CONSTANT i32 192
+    ; CHECK: [[PTR_ADD50:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C50]](s32)
+    ; CHECK: G_STORE [[UV48]](s32), [[PTR_ADD50]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
+    ; CHECK: [[C51:%[0-9]+]]:_(s32) = G_CONSTANT i32 196
+    ; CHECK: [[PTR_ADD51:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C51]](s32)
+    ; CHECK: G_STORE [[UV49]](s32), [[PTR_ADD51]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
+    ; CHECK: [[C52:%[0-9]+]]:_(s32) = G_CONSTANT i32 200
+    ; CHECK: [[PTR_ADD52:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C52]](s32)
+    ; CHECK: G_STORE [[UV50]](s32), [[PTR_ADD52]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
+    ; CHECK: [[C53:%[0-9]+]]:_(s32) = G_CONSTANT i32 204
+    ; CHECK: [[PTR_ADD53:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C53]](s32)
+    ; CHECK: G_STORE [[UV51]](s32), [[PTR_ADD53]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
+    ; CHECK: [[C54:%[0-9]+]]:_(s32) = G_CONSTANT i32 208
+    ; CHECK: [[PTR_ADD54:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C54]](s32)
+    ; CHECK: G_STORE [[UV52]](s32), [[PTR_ADD54]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
+    ; CHECK: [[C55:%[0-9]+]]:_(s32) = G_CONSTANT i32 212
+    ; CHECK: [[PTR_ADD55:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C55]](s32)
+    ; CHECK: G_STORE [[UV53]](s32), [[PTR_ADD55]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
+    ; CHECK: [[C56:%[0-9]+]]:_(s32) = G_CONSTANT i32 216
+    ; CHECK: [[PTR_ADD56:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C56]](s32)
+    ; CHECK: G_STORE [[UV54]](s32), [[PTR_ADD56]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
+    ; CHECK: [[C57:%[0-9]+]]:_(s32) = G_CONSTANT i32 220
+    ; CHECK: [[PTR_ADD57:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C57]](s32)
+    ; CHECK: G_STORE [[UV55]](s32), [[PTR_ADD57]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
+    ; CHECK: [[C58:%[0-9]+]]:_(s32) = G_CONSTANT i32 224
+    ; CHECK: [[PTR_ADD58:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C58]](s32)
+    ; CHECK: G_STORE [[UV56]](s32), [[PTR_ADD58]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
+    ; CHECK: [[C59:%[0-9]+]]:_(s32) = G_CONSTANT i32 228
+    ; CHECK: [[PTR_ADD59:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C59]](s32)
+    ; CHECK: G_STORE [[UV57]](s32), [[PTR_ADD59]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
+    ; CHECK: [[C60:%[0-9]+]]:_(s32) = G_CONSTANT i32 232
+    ; CHECK: [[PTR_ADD60:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C60]](s32)
+    ; CHECK: G_STORE [[UV58]](s32), [[PTR_ADD60]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
+    ; CHECK: [[C61:%[0-9]+]]:_(s32) = G_CONSTANT i32 236
+    ; CHECK: [[PTR_ADD61:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C61]](s32)
+    ; CHECK: G_STORE [[UV59]](s32), [[PTR_ADD61]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
+    ; CHECK: [[C62:%[0-9]+]]:_(s32) = G_CONSTANT i32 240
+    ; CHECK: [[PTR_ADD62:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C62]](s32)
+    ; CHECK: G_STORE [[UV60]](s32), [[PTR_ADD62]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
+    ; CHECK: [[C63:%[0-9]+]]:_(s32) = G_CONSTANT i32 244
+    ; CHECK: [[PTR_ADD63:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C63]](s32)
+    ; CHECK: G_STORE [[UV61]](s32), [[PTR_ADD63]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
+    ; CHECK: [[C64:%[0-9]+]]:_(s32) = G_CONSTANT i32 248
+    ; CHECK: [[PTR_ADD64:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C64]](s32)
+    ; CHECK: G_STORE [[UV62]](s32), [[PTR_ADD64]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
+    ; CHECK: [[C65:%[0-9]+]]:_(s32) = G_CONSTANT i32 252
+    ; CHECK: [[PTR_ADD65:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C65]](s32)
+    ; CHECK: G_STORE [[UV63]](s32), [[PTR_ADD65]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
+    ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p5) :: (load 4 from %stack.0 + 28, addrspace 5)
+    ; CHECK: S_ENDPGM 0, implicit [[LOAD4]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_CONSTANT i32 7
+    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: extract_vector_elt_varidx_v64s32
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2
+
+    ; CHECK-LABEL: name: extract_vector_elt_varidx_v64s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>)
+    ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
+    ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>)
+    ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>)
+    ; CHECK: G_STORE [[UV]](s32), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C3]](s32)
+    ; CHECK: G_STORE [[UV1]](s32), [[PTR_ADD3]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s32)
+    ; CHECK: G_STORE [[UV2]](s32), [[PTR_ADD4]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C5]](s32)
+    ; CHECK: G_STORE [[UV3]](s32), [[PTR_ADD5]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s32)
+    ; CHECK: G_STORE [[UV4]](s32), [[PTR_ADD6]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+    ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C7]](s32)
+    ; CHECK: G_STORE [[UV5]](s32), [[PTR_ADD7]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s32)
+    ; CHECK: G_STORE [[UV6]](s32), [[PTR_ADD8]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
+    ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+    ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C9]](s32)
+    ; CHECK: G_STORE [[UV7]](s32), [[PTR_ADD9]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
+    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s32)
+    ; CHECK: G_STORE [[UV8]](s32), [[PTR_ADD10]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
+    ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 36
+    ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C11]](s32)
+    ; CHECK: G_STORE [[UV9]](s32), [[PTR_ADD11]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
+    ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 40
+    ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C12]](s32)
+    ; CHECK: G_STORE [[UV10]](s32), [[PTR_ADD12]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
+    ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 44
+    ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C13]](s32)
+    ; CHECK: G_STORE [[UV11]](s32), [[PTR_ADD13]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
+    ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 48
+    ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C14]](s32)
+    ; CHECK: G_STORE [[UV12]](s32), [[PTR_ADD14]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
+    ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 52
+    ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C15]](s32)
+    ; CHECK: G_STORE [[UV13]](s32), [[PTR_ADD15]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
+    ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 56
+    ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C16]](s32)
+    ; CHECK: G_STORE [[UV14]](s32), [[PTR_ADD16]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
+    ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 60
+    ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C17]](s32)
+    ; CHECK: G_STORE [[UV15]](s32), [[PTR_ADD17]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
+    ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C18]](s32)
+    ; CHECK: G_STORE [[UV16]](s32), [[PTR_ADD18]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
+    ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 68
+    ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C19]](s32)
+    ; CHECK: G_STORE [[UV17]](s32), [[PTR_ADD19]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
+    ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 72
+    ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C20]](s32)
+    ; CHECK: G_STORE [[UV18]](s32), [[PTR_ADD20]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
+    ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 76
+    ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C21]](s32)
+    ; CHECK: G_STORE [[UV19]](s32), [[PTR_ADD21]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
+    ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 80
+    ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C22]](s32)
+    ; CHECK: G_STORE [[UV20]](s32), [[PTR_ADD22]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
+    ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 84
+    ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C23]](s32)
+    ; CHECK: G_STORE [[UV21]](s32), [[PTR_ADD23]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
+    ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 88
+    ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C24]](s32)
+    ; CHECK: G_STORE [[UV22]](s32), [[PTR_ADD24]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
+    ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 92
+    ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C25]](s32)
+    ; CHECK: G_STORE [[UV23]](s32), [[PTR_ADD25]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
+    ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 96
+    ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C26]](s32)
+    ; CHECK: G_STORE [[UV24]](s32), [[PTR_ADD26]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
+    ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+    ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C27]](s32)
+    ; CHECK: G_STORE [[UV25]](s32), [[PTR_ADD27]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
+    ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
+    ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C28]](s32)
+    ; CHECK: G_STORE [[UV26]](s32), [[PTR_ADD28]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
+    ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 108
+    ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C29]](s32)
+    ; CHECK: G_STORE [[UV27]](s32), [[PTR_ADD29]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
+    ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 112
+    ; CHECK: [[PTR_ADD30:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C30]](s32)
+    ; CHECK: G_STORE [[UV28]](s32), [[PTR_ADD30]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
+    ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 116
+    ; CHECK: [[PTR_ADD31:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C31]](s32)
+    ; CHECK: G_STORE [[UV29]](s32), [[PTR_ADD31]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
+    ; CHECK: [[C32:%[0-9]+]]:_(s32) = G_CONSTANT i32 120
+    ; CHECK: [[PTR_ADD32:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C32]](s32)
+    ; CHECK: G_STORE [[UV30]](s32), [[PTR_ADD32]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
+    ; CHECK: [[C33:%[0-9]+]]:_(s32) = G_CONSTANT i32 124
+    ; CHECK: [[PTR_ADD33:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C33]](s32)
+    ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD33]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
+    ; CHECK: [[C34:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+    ; CHECK: [[PTR_ADD34:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C34]](s32)
+    ; CHECK: G_STORE [[UV32]](s32), [[PTR_ADD34]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
+    ; CHECK: [[C35:%[0-9]+]]:_(s32) = G_CONSTANT i32 132
+    ; CHECK: [[PTR_ADD35:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C35]](s32)
+    ; CHECK: G_STORE [[UV33]](s32), [[PTR_ADD35]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
+    ; CHECK: [[C36:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
+    ; CHECK: [[PTR_ADD36:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C36]](s32)
+    ; CHECK: G_STORE [[UV34]](s32), [[PTR_ADD36]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
+    ; CHECK: [[C37:%[0-9]+]]:_(s32) = G_CONSTANT i32 140
+    ; CHECK: [[PTR_ADD37:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C37]](s32)
+    ; CHECK: G_STORE [[UV35]](s32), [[PTR_ADD37]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
+    ; CHECK: [[C38:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
+    ; CHECK: [[PTR_ADD38:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C38]](s32)
+    ; CHECK: G_STORE [[UV36]](s32), [[PTR_ADD38]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
+    ; CHECK: [[C39:%[0-9]+]]:_(s32) = G_CONSTANT i32 148
+    ; CHECK: [[PTR_ADD39:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C39]](s32)
+    ; CHECK: G_STORE [[UV37]](s32), [[PTR_ADD39]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
+    ; CHECK: [[C40:%[0-9]+]]:_(s32) = G_CONSTANT i32 152
+    ; CHECK: [[PTR_ADD40:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C40]](s32)
+    ; CHECK: G_STORE [[UV38]](s32), [[PTR_ADD40]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
+    ; CHECK: [[C41:%[0-9]+]]:_(s32) = G_CONSTANT i32 156
+    ; CHECK: [[PTR_ADD41:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C41]](s32)
+    ; CHECK: G_STORE [[UV39]](s32), [[PTR_ADD41]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
+    ; CHECK: [[C42:%[0-9]+]]:_(s32) = G_CONSTANT i32 160
+    ; CHECK: [[PTR_ADD42:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C42]](s32)
+    ; CHECK: G_STORE [[UV40]](s32), [[PTR_ADD42]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
+    ; CHECK: [[C43:%[0-9]+]]:_(s32) = G_CONSTANT i32 164
+    ; CHECK: [[PTR_ADD43:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C43]](s32)
+    ; CHECK: G_STORE [[UV41]](s32), [[PTR_ADD43]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
+    ; CHECK: [[C44:%[0-9]+]]:_(s32) = G_CONSTANT i32 168
+    ; CHECK: [[PTR_ADD44:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C44]](s32)
+    ; CHECK: G_STORE [[UV42]](s32), [[PTR_ADD44]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
+    ; CHECK: [[C45:%[0-9]+]]:_(s32) = G_CONSTANT i32 172
+    ; CHECK: [[PTR_ADD45:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C45]](s32)
+    ; CHECK: G_STORE [[UV43]](s32), [[PTR_ADD45]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
+    ; CHECK: [[C46:%[0-9]+]]:_(s32) = G_CONSTANT i32 176
+    ; CHECK: [[PTR_ADD46:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C46]](s32)
+    ; CHECK: G_STORE [[UV44]](s32), [[PTR_ADD46]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
+    ; CHECK: [[C47:%[0-9]+]]:_(s32) = G_CONSTANT i32 180
+    ; CHECK: [[PTR_ADD47:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C47]](s32)
+    ; CHECK: G_STORE [[UV45]](s32), [[PTR_ADD47]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
+    ; CHECK: [[C48:%[0-9]+]]:_(s32) = G_CONSTANT i32 184
+    ; CHECK: [[PTR_ADD48:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C48]](s32)
+    ; CHECK: G_STORE [[UV46]](s32), [[PTR_ADD48]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
+    ; CHECK: [[C49:%[0-9]+]]:_(s32) = G_CONSTANT i32 188
+    ; CHECK: [[PTR_ADD49:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C49]](s32)
+    ; CHECK: G_STORE [[UV47]](s32), [[PTR_ADD49]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
+    ; CHECK: [[C50:%[0-9]+]]:_(s32) = G_CONSTANT i32 192
+    ; CHECK: [[PTR_ADD50:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C50]](s32)
+    ; CHECK: G_STORE [[UV48]](s32), [[PTR_ADD50]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
+    ; CHECK: [[C51:%[0-9]+]]:_(s32) = G_CONSTANT i32 196
+    ; CHECK: [[PTR_ADD51:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C51]](s32)
+    ; CHECK: G_STORE [[UV49]](s32), [[PTR_ADD51]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
+    ; CHECK: [[C52:%[0-9]+]]:_(s32) = G_CONSTANT i32 200
+    ; CHECK: [[PTR_ADD52:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C52]](s32)
+    ; CHECK: G_STORE [[UV50]](s32), [[PTR_ADD52]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
+    ; CHECK: [[C53:%[0-9]+]]:_(s32) = G_CONSTANT i32 204
+    ; CHECK: [[PTR_ADD53:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C53]](s32)
+    ; CHECK: G_STORE [[UV51]](s32), [[PTR_ADD53]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
+    ; CHECK: [[C54:%[0-9]+]]:_(s32) = G_CONSTANT i32 208
+    ; CHECK: [[PTR_ADD54:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C54]](s32)
+    ; CHECK: G_STORE [[UV52]](s32), [[PTR_ADD54]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
+    ; CHECK: [[C55:%[0-9]+]]:_(s32) = G_CONSTANT i32 212
+    ; CHECK: [[PTR_ADD55:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C55]](s32)
+    ; CHECK: G_STORE [[UV53]](s32), [[PTR_ADD55]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
+    ; CHECK: [[C56:%[0-9]+]]:_(s32) = G_CONSTANT i32 216
+    ; CHECK: [[PTR_ADD56:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C56]](s32)
+    ; CHECK: G_STORE [[UV54]](s32), [[PTR_ADD56]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
+    ; CHECK: [[C57:%[0-9]+]]:_(s32) = G_CONSTANT i32 220
+    ; CHECK: [[PTR_ADD57:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C57]](s32)
+    ; CHECK: G_STORE [[UV55]](s32), [[PTR_ADD57]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
+    ; CHECK: [[C58:%[0-9]+]]:_(s32) = G_CONSTANT i32 224
+    ; CHECK: [[PTR_ADD58:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C58]](s32)
+    ; CHECK: G_STORE [[UV56]](s32), [[PTR_ADD58]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
+    ; CHECK: [[C59:%[0-9]+]]:_(s32) = G_CONSTANT i32 228
+    ; CHECK: [[PTR_ADD59:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C59]](s32)
+    ; CHECK: G_STORE [[UV57]](s32), [[PTR_ADD59]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
+    ; CHECK: [[C60:%[0-9]+]]:_(s32) = G_CONSTANT i32 232
+    ; CHECK: [[PTR_ADD60:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C60]](s32)
+    ; CHECK: G_STORE [[UV58]](s32), [[PTR_ADD60]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
+    ; CHECK: [[C61:%[0-9]+]]:_(s32) = G_CONSTANT i32 236
+    ; CHECK: [[PTR_ADD61:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C61]](s32)
+    ; CHECK: G_STORE [[UV59]](s32), [[PTR_ADD61]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
+    ; CHECK: [[C62:%[0-9]+]]:_(s32) = G_CONSTANT i32 240
+    ; CHECK: [[PTR_ADD62:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C62]](s32)
+    ; CHECK: G_STORE [[UV60]](s32), [[PTR_ADD62]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
+    ; CHECK: [[C63:%[0-9]+]]:_(s32) = G_CONSTANT i32 244
+    ; CHECK: [[PTR_ADD63:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C63]](s32)
+    ; CHECK: G_STORE [[UV61]](s32), [[PTR_ADD63]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
+    ; CHECK: [[C64:%[0-9]+]]:_(s32) = G_CONSTANT i32 248
+    ; CHECK: [[PTR_ADD64:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C64]](s32)
+    ; CHECK: G_STORE [[UV62]](s32), [[PTR_ADD64]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
+    ; CHECK: [[C65:%[0-9]+]]:_(s32) = G_CONSTANT i32 252
+    ; CHECK: [[PTR_ADD65:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C65]](s32)
+    ; CHECK: G_STORE [[UV63]](s32), [[PTR_ADD65]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
+    ; CHECK: [[C66:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C66]]
+    ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND]], [[C3]]
+    ; CHECK: [[PTR_ADD66:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32)
+    ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD66]](p5) :: (load 4, addrspace 5)
+    ; CHECK: S_ENDPGM 0, implicit [[LOAD4]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = COPY $sgpr2
+    %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
+    %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: extract_vector_elt_v32s1_varidx_i32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: extract_vector_elt_v32s1_varidx_i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32)
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C7]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C8]](s32)
+    ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C9]](s32)
+    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C10]](s32)
+    ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C11]](s32)
+    ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C12]](s32)
+    ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; CHECK: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C13]](s32)
+    ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C14]](s32)
+    ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C15]](s32)
+    ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C16]](s32)
+    ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 18
+    ; CHECK: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C17]](s32)
+    ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 19
+    ; CHECK: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C18]](s32)
+    ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+    ; CHECK: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C19]](s32)
+    ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 21
+    ; CHECK: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C20]](s32)
+    ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 22
+    ; CHECK: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C21]](s32)
+    ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; CHECK: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C22]](s32)
+    ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C23]](s32)
+    ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK: [[LSHR24:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C24]](s32)
+    ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 26
+    ; CHECK: [[LSHR25:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C25]](s32)
+    ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 27
+    ; CHECK: [[LSHR26:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C26]](s32)
+    ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+    ; CHECK: [[LSHR27:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C27]](s32)
+    ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+    ; CHECK: [[LSHR28:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C28]](s32)
+    ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
+    ; CHECK: [[LSHR29:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C29]](s32)
+    ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[LSHR30:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C30]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C3]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]]
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32)
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]]
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C7]](s32)
+    ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]]
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C8]](s32)
+    ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]]
+    ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32)
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]]
+    ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C9]](s32)
+    ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[OR8]], [[SHL9]]
+    ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C]]
+    ; CHECK: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C10]](s32)
+    ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]]
+    ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32)
+    ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C]]
+    ; CHECK: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C11]](s32)
+    ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]]
+    ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32)
+    ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C]]
+    ; CHECK: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C12]](s32)
+    ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR11]], [[SHL12]]
+    ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32)
+    ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C]]
+    ; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C13]](s32)
+    ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[OR12]], [[SHL13]]
+    ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32)
+    ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C]]
+    ; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C14]](s32)
+    ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[OR13]], [[SHL14]]
+    ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR15]](s32)
+    ; CHECK: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C]]
+    ; CHECK: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND16]], [[C15]](s32)
+    ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[OR14]], [[SHL15]]
+    ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32)
+    ; CHECK: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C]]
+    ; CHECK: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C16]](s32)
+    ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[OR15]], [[SHL16]]
+    ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LSHR17]](s32)
+    ; CHECK: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C]]
+    ; CHECK: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C17]](s32)
+    ; CHECK: [[OR17:%[0-9]+]]:_(s32) = G_OR [[OR16]], [[SHL17]]
+    ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR18]](s32)
+    ; CHECK: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C]]
+    ; CHECK: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[C18]](s32)
+    ; CHECK: [[OR18:%[0-9]+]]:_(s32) = G_OR [[OR17]], [[SHL18]]
+    ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR19]](s32)
+    ; CHECK: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C]]
+    ; CHECK: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C19]](s32)
+    ; CHECK: [[OR19:%[0-9]+]]:_(s32) = G_OR [[OR18]], [[SHL19]]
+    ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32)
+    ; CHECK: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C]]
+    ; CHECK: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[C20]](s32)
+    ; CHECK: [[OR20:%[0-9]+]]:_(s32) = G_OR [[OR19]], [[SHL20]]
+    ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR21]](s32)
+    ; CHECK: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C]]
+    ; CHECK: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C21]](s32)
+    ; CHECK: [[OR21:%[0-9]+]]:_(s32) = G_OR [[OR20]], [[SHL21]]
+    ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32)
+    ; CHECK: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C]]
+    ; CHECK: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[C22]](s32)
+    ; CHECK: [[OR22:%[0-9]+]]:_(s32) = G_OR [[OR21]], [[SHL22]]
+    ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR23]](s32)
+    ; CHECK: [[AND24:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C]]
+    ; CHECK: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[AND24]], [[C23]](s32)
+    ; CHECK: [[OR23:%[0-9]+]]:_(s32) = G_OR [[OR22]], [[SHL23]]
+    ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR24]](s32)
+    ; CHECK: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C]]
+    ; CHECK: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[C24]](s32)
+    ; CHECK: [[OR24:%[0-9]+]]:_(s32) = G_OR [[OR23]], [[SHL24]]
+    ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR25]](s32)
+    ; CHECK: [[AND26:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C]]
+    ; CHECK: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[AND26]], [[C25]](s32)
+    ; CHECK: [[OR25:%[0-9]+]]:_(s32) = G_OR [[OR24]], [[SHL25]]
+    ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR26]](s32)
+    ; CHECK: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C]]
+    ; CHECK: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[C26]](s32)
+    ; CHECK: [[OR26:%[0-9]+]]:_(s32) = G_OR [[OR25]], [[SHL26]]
+    ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR27]](s32)
+    ; CHECK: [[AND28:%[0-9]+]]:_(s32) = G_AND [[COPY30]], [[C]]
+    ; CHECK: [[SHL27:%[0-9]+]]:_(s32) = G_SHL [[AND28]], [[C27]](s32)
+    ; CHECK: [[OR27:%[0-9]+]]:_(s32) = G_OR [[OR26]], [[SHL27]]
+    ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LSHR28]](s32)
+    ; CHECK: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C]]
+    ; CHECK: [[SHL28:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[C28]](s32)
+    ; CHECK: [[OR28:%[0-9]+]]:_(s32) = G_OR [[OR27]], [[SHL28]]
+    ; CHECK: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR29]](s32)
+    ; CHECK: [[AND30:%[0-9]+]]:_(s32) = G_AND [[COPY32]], [[C]]
+    ; CHECK: [[SHL29:%[0-9]+]]:_(s32) = G_SHL [[AND30]], [[C29]](s32)
+    ; CHECK: [[OR29:%[0-9]+]]:_(s32) = G_OR [[OR28]], [[SHL29]]
+    ; CHECK: [[COPY33:%[0-9]+]]:_(s32) = COPY [[LSHR30]](s32)
+    ; CHECK: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY33]], [[C]]
+    ; CHECK: [[SHL30:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[C30]](s32)
+    ; CHECK: [[OR30:%[0-9]+]]:_(s32) = G_OR [[OR29]], [[SHL30]]
+    ; CHECK: [[AND32:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C30]]
+    ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[SHL31:%[0-9]+]]:_(s32) = G_SHL [[AND32]], [[C31]](s32)
+    ; CHECK: [[LSHR31:%[0-9]+]]:_(s32) = G_LSHR [[OR30]], [[SHL31]](s32)
+    ; CHECK: [[COPY34:%[0-9]+]]:_(s32) = COPY [[LSHR31]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY34]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<32 x s1>) = G_BITCAST %0
+    %3:_(s1) = G_EXTRACT_VECTOR_ELT %2, %1
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: extract_vector_elt_v12s8_varidx_s32
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2, $vgpr3
+    ; CHECK-LABEL: name: extract_vector_elt_v12s8_varidx_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
+    ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
+    ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
+    ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
+    ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
+    ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32)
+    ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32)
+    ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+    ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
+    ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
+    ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
+    ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32)
+    ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]]
+    ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32)
+    ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
+    ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C1]](s32)
+    ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]]
+    ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32)
+    ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]]
+    ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32)
+    ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]]
+    ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
+    ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]]
+    ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32)
+    ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]]
+    ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32)
+    ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]]
+    ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32)
+    ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]]
+    ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32)
+    ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]]
+    ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32)
+    ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]]
+    ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32)
+    ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]]
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C4]](s32)
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[LSHR9]](s32)
+    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]]
+    ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C5]](s32)
+    ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL9]](s32)
+    ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32)
+    ; CHECK: $vgpr0 = COPY [[COPY14]](s32)
+    %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    %1:_(<12 x s8>) = G_BITCAST %0
+    %2:_(s32) = COPY $vgpr3
+    %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+ +--- +name: extract_vector_elt_v3s8_varidx_s32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: extract_vector_elt_v3s8_varidx_s32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 8 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[COPY1]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY8]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s24) = G_TRUNC %0 + %3:_(<3 x s8>) = G_BITCAST %2 + %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir index 9bb25356c950c..0e23fba6afaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir @@ -522,11 +522,14 @@ body: | ; CHECK-LABEL: name: extract_s8_v3s8_offset16 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[DEF]](<4 x s32>) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[TRUNC]](<4 x s8>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s8) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s8) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s8) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[DEF1]](<4 x s32>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC1]](<4 x s8>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s8>), [[UV1:%[0-9]+]]:_(<3 x s8>), [[UV2:%[0-9]+]]:_(<3 x s8>), [[UV3:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV]](<3 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV6]](s8) ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT]](s16) ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 @@ -538,12 +541,12 @@ body: | ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], 
[[C]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>) - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<4 x s16>), 32 - ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT2]](s16) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 + ; CHECK: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT3]](s32) %0:_(<3 x s8>) = G_IMPLICIT_DEF %1:_(s8) = G_EXTRACT %0, 16 @@ -558,13 +561,16 @@ body: | ; CHECK-LABEL: name: extract_s8_v5s1_offset4 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s32>) = G_IMPLICIT_DEF ; CHECK: [[TRUNC:%[0-9]+]]:_(<6 x s1>) = G_TRUNC [[DEF]](<6 x s32>) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s1>) = G_EXTRACT [[TRUNC]](<6 x s1>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1), [[UV3:%[0-9]+]]:_(s1), [[UV4:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[EXTRACT]](<5 x s1>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV]](s1) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV1]](s1) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV2]](s1) - ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV3]](s1) - ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s16) = G_ANYEXT [[UV4]](s1) + ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<6 x s1>) = G_TRUNC [[DEF1]](<6 x s32>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s1>) = G_CONCAT_VECTORS [[TRUNC]](<6 x s1>), [[TRUNC1]](<6 x s1>), [[TRUNC1]](<6 x s1>), [[TRUNC1]](<6 x s1>), [[TRUNC1]](<6 x s1>) + ; CHECK: [[UV:%[0-9]+]]:_(<5 x s1>), [[UV1:%[0-9]+]]:_(<5 x s1>), [[UV2:%[0-9]+]]:_(<5 x s1>), [[UV3:%[0-9]+]]:_(<5 x s1>), [[UV4:%[0-9]+]]:_(<5 x s1>), [[UV5:%[0-9]+]]:_(<5 x s1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s1>) + ; CHECK: [[UV6:%[0-9]+]]:_(s1), [[UV7:%[0-9]+]]:_(s1), [[UV8:%[0-9]+]]:_(s1), [[UV9:%[0-9]+]]:_(s1), [[UV10:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[UV]](<5 x s1>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[UV6]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[UV7]](s1) + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s16) = G_ANYEXT [[UV8]](s1) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s16) = G_ANYEXT [[UV9]](s1) + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s16) = G_ANYEXT [[UV10]](s1) ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT]](s16) ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ANYEXT1]](s16) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 @@ -581,12 +587,12 @@ body: | ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL2]] ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) - ; CHECK: 
[[EXTRACT1:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<5 x s16>), 0 - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<6 x s16>), 64 - ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT2]](s16) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; CHECK: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 + ; CHECK: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<6 x s16>), 64 + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT5]](s32) %0:_(<5 x s1>) = G_IMPLICIT_DEF %1:_(s1) = G_EXTRACT %0, 4 @@ -850,10 +856,12 @@ body: | ; CHECK-LABEL: name: extract_s16_v3s16_offset0 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(s16) = G_EXTRACT %0, 0 @@ -933,10 +941,12 @@ body: | ; CHECK-LABEL: name: extract_v2s16_v3s16_offset0 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; CHECK: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; CHECK: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<2 x s16>) = G_EXTRACT %0, 0 $vgpr0 = COPY %1 @@ -949,10 +959,12 @@ body: | ; CHECK-LABEL: name: extract_v2s16_v5s16_offset0 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF]](<6 x s16>), 0 - ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<5 x s16>), 0 - ; CHECK: 
[[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<6 x s16>), 0 - ; CHECK: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>), [[UV2:%[0-9]+]]:_(<5 x s16>), [[UV3:%[0-9]+]]:_(<5 x s16>), [[UV4:%[0-9]+]]:_(<5 x s16>), [[UV5:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV]](<5 x s16>), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<6 x s16>), 0 + ; CHECK: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) %0:_(<5 x s16>) = G_IMPLICIT_DEF %1:_(<2 x s16>) = G_EXTRACT %0, 0 $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir index e693766954c98..0857d286ff5c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir @@ -209,13 +209,15 @@ body: | ; SI-LABEL: name: test_fabs_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) @@ -231,21 +233,23 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[FABS:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST2]] ; SI: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST3]] - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT 
[[CONCAT_VECTORS1]](<6 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fabs_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) @@ -261,34 +265,36 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[FABS:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST2]] ; VI: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BITCAST3]] - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fabs_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x 
s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF1]](s32) - ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF2]](s32) + ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: [[FABS:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR_TRUNC]] ; GFX9: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF2]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FABS]](<2 x s16>), [[FABS1]](<2 x s16>), [[DEF3]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FABS %0 S_NOP 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir index 50d846ca2367a..1235d999f2e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir @@ -330,26 +330,29 @@ body: | ; SI-LABEL: name: test_fadd_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -374,31 +377,34 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fadd_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT 
[[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[TRUNC]], [[TRUNC3]] @@ -414,44 +420,47 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fadd_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), 
[[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF2]](s32) - ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF3]](s32) + ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF3]](s32) ; GFX9: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] ; GFX9: [[FADD1:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<2 x s16>), [[FADD1]](<2 x s16>), [[DEF3]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<2 x s16>), [[FADD1]](<2 x s16>), [[DEF4]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = 
G_FADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir index 9efb04c35f6e9..e5b9fe0b86562 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir @@ -222,15 +222,17 @@ body: | ; SI-LABEL: name: test_fcanonicalize_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -252,20 +254,22 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fcanonicalize_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: 
[[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] @@ -281,31 +285,33 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fcanonicalize_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF1]](s32) - ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF2]](s32) + ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; GFX9: 
[[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC]] ; GFX9: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FCANONICALIZE]](<2 x s16>), [[FCANONICALIZE1]](<2 x s16>), [[DEF2]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FCANONICALIZE]](<2 x s16>), [[FCANONICALIZE1]](<2 x s16>), [[DEF3]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FCANONICALIZE %0 S_NOP 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir index a6b5fc876f7d6..f97896ba623d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir @@ -326,15 +326,17 @@ body: | ; SI-LABEL: name: test_fcos_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -363,20 +365,22 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fcos_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3118 @@ -399,20 +403,22 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fcos_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + 
; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3118 @@ -426,11 +432,11 @@ body: | ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FCOS %0 S_NOP 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir index 3ed48e39e57e3..3236b8c5a3d06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -1307,26 +1307,29 @@ body: | ; SI-LABEL: name: test_fdiv_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x 
s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -1385,31 +1388,34 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fdiv_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -1440,31 +1446,34 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fdiv_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -1489,33 +1498,36 @@ body: | ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) - ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF3]](s32) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16 ; GFX9-UNSAFE: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9-UNSAFE: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9-UNSAFE: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9-UNSAFE: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-UNSAFE: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-UNSAFE: [[UV:%[0-9]+]]:_(<3 x 
s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9-UNSAFE: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9-UNSAFE: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-UNSAFE: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9-UNSAFE: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9-UNSAFE: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9-UNSAFE: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX9-UNSAFE: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-UNSAFE: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-UNSAFE: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9-UNSAFE: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-UNSAFE: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-UNSAFE: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX9-UNSAFE: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-UNSAFE: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-UNSAFE: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9-UNSAFE: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9-UNSAFE: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-UNSAFE: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9-UNSAFE: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9-UNSAFE: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX9-UNSAFE: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-UNSAFE: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9-UNSAFE: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX9-UNSAFE: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-UNSAFE: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX9-UNSAFE: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-UNSAFE: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9-UNSAFE: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC3]](s16) @@ -1528,33 +1540,36 @@ body: | ; GFX9-UNSAFE: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL1]](s16) ; GFX9-UNSAFE: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9-UNSAFE: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16) - ; GFX9-UNSAFE: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-UNSAFE: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32) - ; GFX9-UNSAFE: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-UNSAFE: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9-UNSAFE: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9-UNSAFE: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9-UNSAFE: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF3]](s32) + ; GFX9-UNSAFE: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = 
G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-UNSAFE: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; GFX9-UNSAFE: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX10-LABEL: name: test_fdiv_v3s16 ; GFX10: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX10: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX10: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX10: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; GFX10: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX10: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX10: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX10: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX10: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX10: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX10: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX10: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX10: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX10: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX10: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX10: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX10: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX10: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX10: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX10: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -1579,11 +1594,11 @@ body: | ; GFX10: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX10: 
[[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) - ; GFX10: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32) - ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX10: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX10: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX10: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF3]](s32) + ; GFX10: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; GFX10: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_FDIV %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir index b1eec4dfee3d3..4466b4c521c56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir @@ -322,15 +322,17 @@ body: | ; SI-LABEL: name: test_ffloor_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -352,20 +354,22 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x 
s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_ffloor_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI: [[FFLOOR:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC]] @@ -381,20 +385,22 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_ffloor_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 
16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[FFLOOR:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC]] @@ -404,11 +410,11 @@ body: | ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR1]](s16) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR2]](s16) - ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32) - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FFLOOR %0 S_NOP 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir index 06f237f7a3050..56c51dfa726e3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir @@ -393,37 +393,41 @@ body: | ; SI-LABEL: name: test_fma_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF2]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), 
[[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 - ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; SI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) - ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -451,42 +455,46 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) - ; SI: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT3]](<3 x s16>) + ; SI: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit 
[[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fma_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF2]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 - ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = 
G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; VI: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) - ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; VI: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC3]], [[TRUNC6]] @@ -502,57 +510,61 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) - ; VI: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT3]](<3 x s16>) + ; VI: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fma_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF2]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>) 
; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF3]](s32) - ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF4]](s32) + ; GFX9: [[DEF5:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV14]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV15]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF3]](s32) - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 - ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) - ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF4]](s32) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 + ; GFX9: [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT2]](<4 x s16>) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV16]](<2 x s16>) ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV17]](<2 x s16>) ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[DEF3]](s32) + ; GFX9: 
[[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[DEF4]](s32) ; GFX9: [[FMA:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]], [[BUILD_VECTOR_TRUNC4]] ; GFX9: [[FMA1:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<2 x s16>), [[FMA1]](<2 x s16>), [[DEF4]](<2 x s16>) - ; GFX9: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT3]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<2 x s16>), [[FMA1]](<2 x s16>), [[DEF5]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS3]](<6 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir index 8b9b0e972e6f4..24253aaeb4c38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir @@ -506,8 +506,9 @@ body: | ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] ; GFX9: [[FMAXNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMAXNUM_IEEE]](<2 x s16>), [[FMAXNUM_IEEE1]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir index f6456cd57f01e..723168a2a8ec3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir @@ -506,8 +506,9 @@ body: | ; GFX9: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] ; GFX9: [[FMINNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMINNUM_IEEE]](<2 x s16>), [[FMINNUM_IEEE1]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY 
[[INSERT2]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir index 45130639f696f..ea9b1e66d7e13 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir @@ -329,26 +329,29 @@ body: | ; SI-LABEL: name: test_fmul_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) @@ -373,31 +376,34 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; SI: 
[[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fmul_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[TRUNC3]] 
@@ -413,44 +419,47 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fmul_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF2]](s32) - ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF3]](s32) + ; GFX9: [[DEF4:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x 
s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF3]](s32) ; GFX9: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] ; GFX9: [[FMUL1:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMUL]](<2 x s16>), [[FMUL1]](<2 x s16>), [[DEF3]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMUL]](<2 x s16>), [[FMUL1]](<2 x s16>), [[DEF4]](<2 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<6 x s16>), 0 + ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_FMUL %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir index 75e498b3a2e75..35f229088167c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir @@ -207,13 +207,15 @@ body: | ; SI-LABEL: name: test_fneg_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; SI: 
[[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) @@ -229,21 +231,23 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; SI: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST2]] ; SI: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST3]] - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF1]](<2 x s16>) - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_fneg_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) @@ -259,34 +263,36 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF ; VI: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST2]] ; VI: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BITCAST3]] - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF1]](<2 x s16>) - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>) + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_fneg_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32)
 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
- ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF1]](s32)
- ; GFX9: [[DEF2:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[DEF2]](s32)
+ ; GFX9: [[DEF3:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
 ; GFX9: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR_TRUNC]]
 ; GFX9: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR_TRUNC1]]
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF2]](<2 x s16>)
- ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0
- ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FNEG]](<2 x s16>), [[FNEG1]](<2 x s16>), [[DEF3]](<2 x s16>)
+ ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<6 x s16>), 0
+ ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 %0:_(<3 x s16>) = G_IMPLICIT_DEF
 %1:_(<3 x s16>) = G_FNEG %0
 S_NOP 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir
index c6ba0b60d99dd..844c972a6dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpext.mir
@@ -70,15 +70,17 @@ body: |
 ; CHECK-LABEL: name: test_fpext_v3f16_to_v3f32
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; CHECK: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-freeze.mir
index 460d23b441a0b..8b7460d6a73a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-freeze.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-freeze.mir
@@ -422,19 +422,20 @@ body: |
 bb.0:
 ; CHECK-LABEL: name: test_freeze_v33s32
- ; CHECK: [[DEF:%[0-9]+]]:_(<33 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<33 x s32>)
+ ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<528 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>)
+ ; CHECK: [[EXTRACT:%[0-9]+]]:_(<33 x s32>) = G_EXTRACT [[CONCAT_VECTORS]](<528 x s32>), 0
+ ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<33 x s32>)
 ; CHECK: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32), [[UV8]](s32), [[UV9]](s32), [[UV10]](s32), [[UV11]](s32), [[UV12]](s32), [[UV13]](s32), [[UV14]](s32), [[UV15]](s32)
 ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[UV16]](s32), [[UV17]](s32), [[UV18]](s32), [[UV19]](s32), [[UV20]](s32), [[UV21]](s32), [[UV22]](s32), [[UV23]](s32), [[UV24]](s32), [[UV25]](s32), [[UV26]](s32), [[UV27]](s32), [[UV28]](s32), [[UV29]](s32), [[UV30]](s32), [[UV31]](s32)
 ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[UV32]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32), [[DEF1]](s32)
- ; CHECK: [[DEF2:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[BUILD_VECTOR]]
 ; CHECK: [[FREEZE1:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[BUILD_VECTOR1]]
 ; CHECK: [[FREEZE2:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[BUILD_VECTOR2]]
- ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<528 x s32>) = G_CONCAT_VECTORS [[FREEZE]](<16 x s32>), [[FREEZE1]](<16 x s32>), [[FREEZE2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>), [[DEF2]](<16 x s32>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<33 x s32>) = G_EXTRACT [[CONCAT_VECTORS]](<528 x s32>), 0
- ; CHECK: S_NOP 0, implicit [[EXTRACT]](<33 x s32>)
+ ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<528 x s32>) = G_CONCAT_VECTORS [[FREEZE]](<16 x s32>), [[FREEZE1]](<16 x s32>), [[FREEZE2]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>)
+ ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<33 x s32>) = G_EXTRACT [[CONCAT_VECTORS1]](<528 x s32>), 0
+ ; CHECK: S_NOP 0, implicit [[EXTRACT1]](<33 x s32>)
 %0:_(<33 x s32>) = G_IMPLICIT_DEF
 %1:_(<33 x s32>) = G_FREEZE %0
 S_NOP 0, implicit %1
@@ -447,13 +448,10 @@ body: |
 ; CHECK-LABEL: name: test_freeze_v64s32
 ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
- ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
- ; CHECK: [[COPY2:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[DEF]]
- ; CHECK: [[FREEZE1:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[COPY]]
- ; CHECK: [[FREEZE2:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[COPY1]]
- ; CHECK: [[FREEZE3:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[COPY2]]
+ ; CHECK: [[FREEZE1:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[DEF]]
+ ; CHECK: [[FREEZE2:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[DEF]]
+ ; CHECK: [[FREEZE3:%[0-9]+]]:_(<16 x s32>) = G_FREEZE [[DEF]]
 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[FREEZE]](<16 x s32>), [[FREEZE1]](<16 x s32>), [[FREEZE2]](<16 x s32>), [[FREEZE3]](<16 x s32>)
 ; CHECK: S_NOP 0, implicit [[CONCAT_VECTORS]](<64 x s32>)
 %0:_(<64 x s32>) = G_IMPLICIT_DEF
@@ -490,8 +488,9 @@ body: |
 ; CHECK: [[ANYEXT:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[INSERT]](<4 x s1>)
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[ANYEXT]]
 ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[FREEZE]](<4 x s32>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s1>) = G_EXTRACT [[TRUNC]](<4 x s1>), 0
- ; CHECK: [[ANYEXT1:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s1>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s1>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s1>), [[DEF1]](<4 x s1>), [[DEF1]](<4 x s1>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s1>), [[UV1:%[0-9]+]]:_(<3 x s1>), [[UV2:%[0-9]+]]:_(<3 x s1>), [[UV3:%[0-9]+]]:_(<3 x s1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s1>)
+ ; CHECK: [[ANYEXT1:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV]](<3 x s1>)
 ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT1]](<3 x s32>)
 %0:_(<3 x s1>) = G_IMPLICIT_DEF
 %1:_(<3 x s1>) = G_FREEZE %0
@@ -530,8 +529,9 @@ body: |
 ; CHECK: [[ANYEXT:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[INSERT]](<4 x s8>)
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<4 x s32>) = G_FREEZE [[ANYEXT]]
 ; CHECK: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[FREEZE]](<4 x s32>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[TRUNC1]](<4 x s8>), 0
- ; CHECK: [[ANYEXT1:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s8>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[TRUNC1]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s8>), [[UV1:%[0-9]+]]:_(<3 x s8>), [[UV2:%[0-9]+]]:_(<3 x s8>), [[UV3:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>)
+ ; CHECK: [[ANYEXT1:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV]](<3 x s8>)
 ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT1]](<3 x s32>)
 %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
 %1:_(<3 x s8>) = G_TRUNC %0
@@ -573,13 +573,14 @@ body: |
 ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<4 x s16>) = G_FREEZE [[INSERT]]
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[FREEZE]](<4 x s16>), 0
- ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0
- ; CHECK: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
- ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[FREEZE]](<4 x s16>), [[DEF2]](<4 x s16>), [[DEF2]](<4 x s16>)
+ ; CHECK: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV3]](<3 x s16>), 0
+ ; CHECK: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
+ ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
 ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
@@ -628,15 +629,16 @@ body: |
 ; CHECK: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0
 ; CHECK: [[FREEZE:%[0-9]+]]:_(<6 x s16>) = G_FREEZE [[INSERT]]
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[FREEZE]](<6 x s16>), 0
- ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<5 x s16>), 0
- ; CHECK: [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>)
- ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
+ ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[FREEZE]](<6 x s16>), [[DEF2]](<6 x s16>), [[DEF2]](<6 x s16>), [[DEF2]](<6 x s16>), [[DEF2]](<6 x s16>)
+ ; CHECK: [[UV5:%[0-9]+]]:_(<5 x s16>), [[UV6:%[0-9]+]]:_(<5 x s16>), [[UV7:%[0-9]+]]:_(<5 x s16>), [[UV8:%[0-9]+]]:_(<5 x s16>), [[UV9:%[0-9]+]]:_(<5 x s16>), [[UV10:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<30 x s16>)
+ ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[UV5]](<5 x s16>), 0
+ ; CHECK: [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>)
+ ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>)
+ ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV12]](<2 x s16>)
 ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
+ ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV13]](<2 x s16>)
 ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir
index 20ba990e8bdb1..740f023b93ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir
@@ -326,15 +326,17 @@ body: |
 ; SI-LABEL: name: test_fsin_v3s16
 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
@@ -363,20 +365,22 @@ body: |
 ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
 ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
- ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; VI-LABEL: name: test_fsin_v3s16
 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; VI: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3118
@@ -399,20 +403,22 @@ body: |
 ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
 ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
- ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; GFX9-LABEL: name: test_fsin_v3s16
 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3118
@@ -426,11 +432,11 @@ body: |
 ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
 ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
- ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32)
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32)
+ ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 %0:_(<3 x s16>) = G_IMPLICIT_DEF
 %1:_(<3 x s16>) = G_FSIN %0
 S_NOP 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
index 9508109707cc1..4be1a32f81148 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
@@ -246,15 +246,17 @@ body: |
 ; SI-LABEL: name: test_fsqrt_v3s16
 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; SI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
@@ -276,20 +278,22 @@ body: |
 ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
 ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
- ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; SI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; VI-LABEL: name: test_fsqrt_v3s16
 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; VI: [[FSQRT:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC]]
@@ -305,20 +309,22 @@ body: |
 ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
 ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
- ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; VI: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; GFX9-LABEL: name: test_fsqrt_v3s16
 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
- ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
- ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
 ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
 ; GFX9: [[FSQRT:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC]]
@@ -328,11 +334,11 @@ body: |
 ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT1]](s16)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
 ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT2]](s16)
- ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF1]](s32)
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; GFX9: S_NOP 0, implicit [[EXTRACT1]](<3 x s16>)
+ ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32)
+ ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS1]](<4 x s16>), 0
+ ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 %0:_(<3 x s16>) = G_IMPLICIT_DEF
 %1:_(<3 x s16>) = G_FSQRT %0
 S_NOP 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir
index 7df607de5ed84..710e0b405fc9c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir
@@ -374,26 +374,29 @@ body: |
 ; SI-LABEL: name: test_fsub_v3s16
 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
- ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
- ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
 ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
 ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
- ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
- ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+ ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+ ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
 ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
- ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
 ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
 ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
 ; SI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]]
@@ -421,31 +424,34 @@ body: |
 ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
 ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
- ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+ ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+ ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+ ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; VI-LABEL: name: test_fsub_v3s16
 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
- ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
- ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
 ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
- ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
- ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+ ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+ ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
 ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
- ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
 ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
 ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
 ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]]
@@ -464,31 +470,34 @@ body: |
 ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
 ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
 ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
- ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+ ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+ ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+ ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 ; GFX9-LABEL: name: test_fsub_v3s16
 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
- ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
- ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
- ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+ ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+ ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
 ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
- ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
- ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+ ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+ ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
 ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
- ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
 ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
 ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
 ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC3]]
@@ -501,11 +510,11 @@ body: |
 ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
 ; GFX9: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16)
- ; GFX9: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF2]](s32)
- ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
- ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+ ; GFX9: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[DEF3]](s32)
+ ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+ ; GFX9: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
 %0:_(<3 x s16>) = G_IMPLICIT_DEF
 %1:_(<3 x s16>) = G_IMPLICIT_DEF
 %2:_(<3 x s16>) = G_FSUB %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir
index 1ce85a95ef6db..5fe42c120db81 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir
@@ -327,14 +327,22 @@ body: |
 name: test_implicit_def_v33s32
 body: |
 bb.0:
+ liveins: $vgpr0_vgpr1
 ; CHECK-LABEL: name: test_implicit_def_v33s32
- ; CHECK: [[DEF:%[0-9]+]]:_(<33 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<33 x s32>)
- ; CHECK: S_NOP 0, implicit [[UV]](s32), implicit [[UV32]](s32)
+ ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<528 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>)
+ ; CHECK: [[EXTRACT:%[0-9]+]]:_(<33 x s32>) = G_EXTRACT [[CONCAT_VECTORS]](<528 x s32>), 0
+ ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[EXTRACT]](<33 x s32>)
+ ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+ ; CHECK: G_STORE [[UV]](s32), [[COPY]](p1) :: (volatile store 4, addrspace 1)
+ ; CHECK: G_STORE [[UV32]](s32), [[COPY]](p1) :: (volatile store 4, addrspace 1)
 %0:_(<33 x s32>) = G_IMPLICIT_DEF
 %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32), %17:_(s32), %18:_(s32), %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32), %23:_(s32), %24:_(s32), %25:_(s32), %26:_(s32), %27:_(s32), %28:_(s32), %29:_(s32), %30:_(s32), %31:_(s32), %32:_(s32), %33:_(s32) = G_UNMERGE_VALUES %0
- S_NOP 0, implicit %1, implicit %33
+ %34:_(p1) = COPY $vgpr0_vgpr1
+ G_STORE %1, %34 :: (volatile store 4, align 4, addrspace 1)
+ G_STORE %33, %34 :: (volatile store 4, align 4, addrspace 1)
+
 ...
---
@@ -344,11 +352,8 @@ body: |
 ; CHECK-LABEL: name: test_implicit_def_v64s32
 ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
- ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
- ; CHECK: [[COPY2:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
- ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[COPY]](<16 x s32>), [[COPY1]](<16 x s32>), [[COPY2]](<16 x s32>)
- ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[COPY]](<16 x s32>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>), [[DEF]](<16 x s32>)
+ ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[DEF]](<16 x s32>)
 ; CHECK: S_NOP 0, implicit [[CONCAT_VECTORS]](<64 x s32>), implicit [[CONCAT_VECTORS1]](<32 x s32>)
 %0:_(<64 x s32>) = G_IMPLICIT_DEF
 %1:_(<32 x s32>), %2:_(<32 x s32>) = G_UNMERGE_VALUES %0
@@ -377,11 +382,14 @@ body: |
 ; CHECK-LABEL: name: test_implicit_def_v3s1
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
 ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[DEF]](<4 x s32>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s1>) = G_EXTRACT [[TRUNC]](<4 x s1>), 0
- ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s1>)
- ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s1)
- ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s1)
- ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s1)
+ ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[DEF1]](<4 x s32>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s1>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s1>), [[TRUNC1]](<4 x s1>), [[TRUNC1]](<4 x s1>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s1>), [[UV1:%[0-9]+]]:_(<3 x s1>), [[UV2:%[0-9]+]]:_(<3 x s1>), [[UV3:%[0-9]+]]:_(<3 x s1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s1>)
+ ; CHECK: [[UV4:%[0-9]+]]:_(s1), [[UV5:%[0-9]+]]:_(s1), [[UV6:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[UV]](<3 x s1>)
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s1)
+ ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s1)
+ ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s1)
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
 ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
 %0:_(<3 x s1>) = G_IMPLICIT_DEF
@@ -411,11 +419,14 @@ body: |
 ; CHECK-LABEL: name: test_implicit_def_v3s8
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
 ; CHECK: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[DEF]](<4 x s32>)
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[TRUNC]](<4 x s8>), 0
- ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s8>)
- ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8)
- ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8)
- ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8)
+ ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[DEF1]](<4 x s32>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC1]](<4 x s8>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s8>), [[UV1:%[0-9]+]]:_(<3 x s8>), [[UV2:%[0-9]+]]:_(<3 x s8>), [[UV3:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>)
+ ; CHECK: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV]](<3 x s8>)
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8)
+ ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8)
+ ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8)
 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
 ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
 %0:_(<3 x s8>) = G_IMPLICIT_DEF
@@ -442,9 +453,10 @@ body: |
 ; CHECK-LABEL: name: test_implicit_def_v3s16
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %1(<4 x s16>), %1(<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+ ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 %0:_(<3 x s16>) = G_IMPLICIT_DEF
 %1:_(<4 x s16>) = G_IMPLICIT_DEF
@@ -471,9 +483,11 @@ body: |
 ; CHECK-LABEL: name: test_implicit_def_v5s16
 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF
- ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF]](<6 x s16>), 0
- ; CHECK: [[DEF1:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF
- ; CHECK: [[INSERT:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<5 x s16>), 0
+ ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>), [[UV2:%[0-9]+]]:_(<5 x s16>), [[UV3:%[0-9]+]]:_(<5 x s16>), [[UV4:%[0-9]+]]:_(<5 x s16>), [[UV5:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s16>)
+ ; CHECK: [[DEF2:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF
+ ; CHECK: [[INSERT:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF2]], [[UV]](<5 x s16>), 0
 ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](<8 x s16>)
 %0:_(<5 x s16>) = G_IMPLICIT_DEF
 %1:_(<8 x s16>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
index 37971d389fe4e..99ff3fff27dda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
@@ -843,8 +843,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[TRUNC]](s16), 0
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -869,8 +870,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[TRUNC]](s16), 16
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -895,8 +897,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[TRUNC]](s16), 32
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -920,8 +923,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[COPY1]](<2 x s16>), 0
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -944,8 +948,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[COPY1]](<2 x s16>), 16
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -968,8 +973,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[COPY1]](s32), 0
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
@@ -992,8 +998,9 @@ body: |
 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[COPY1]](s32), 16
- ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+ ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+ ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_EXTRACT %0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index 86d7a2f4e4dbd..e566572763e57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -16,8 +16,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -38,8 +40,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -67,12 +71,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -94,12 +100,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -130,15 +138,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -163,15 +174,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -205,15 +219,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -238,15 +255,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32),
[[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -279,12 +299,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -306,12 +328,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), 
[[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -342,15 +366,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -375,15 +402,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; 
GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -417,15 +447,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -450,15 +483,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), 
[[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -492,18 +528,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS 
[[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -527,18 +566,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -571,12 +613,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = 
G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -598,12 +642,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -634,15 +680,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: 
[[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -667,15 +716,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -709,18 +761,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -744,18 +799,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -789,18 +847,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -824,18 +885,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -869,15 +933,18 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -902,15 +969,18 @@ define amdgpu_ps <4 x 
float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -944,18 +1014,21 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: 
[[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -979,18 +1052,21 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -1028,8 +1104,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), 
[[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d @@ -1050,8 +1128,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -1079,12 +1159,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 @@ -1106,12 +1188,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 @@ -1142,15 +1226,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1175,15 +1262,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), 
[[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1217,15 +1307,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1250,15 +1343,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1291,12 +1387,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX9: S_ENDPGM 0
@@ -1318,12 +1416,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX10NSA: S_ENDPGM 0
@@ -1354,15 +1454,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1387,15 +1490,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1429,15 +1535,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1462,15 +1571,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1504,18 +1616,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1539,18 +1654,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1583,12 +1701,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX9: S_ENDPGM 0
@@ -1610,12 +1730,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
 ; GFX10NSA: S_ENDPGM 0
@@ -1646,15 +1768,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1679,15 +1804,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1721,18 +1849,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1756,18 +1887,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1801,18 +1935,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1836,18 +1973,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1881,15 +2021,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1914,15 +2057,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
 ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
@@ -1956,18 +2102,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -1991,18 +2140,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
 ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
+ ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
 ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
+ ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+ ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+ ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32)
+ ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32)
 ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
 ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8")
@@ -2030,8 +2182,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2052,8 +2206,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2081,8 +2237,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2103,8 +2261,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2132,8 +2292,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2154,8 +2316,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2183,8 +2347,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2205,8 +2371,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2234,8 +2402,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2256,8 +2426,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2285,8 +2457,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2307,8 +2481,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2336,8 +2512,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2358,8 +2536,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2387,8 +2567,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2409,8 +2591,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2438,8 +2622,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
 ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -2456,8 +2642,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8")
 ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -2481,8 +2669,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX9: $vgpr0 = COPY [[UV]](s32)
@@ -2501,8 +2691,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8")
 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
@@ -2529,8 +2721,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
 ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
 ; GFX9: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_1d_V1
@@ -2547,8 +2741,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8")
 ; GFX10NSA: S_ENDPGM 0
 main_body:
@@ -2574,8 +2770,10 @@ define amdgpu_ps void 
@store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_V2 @@ -2594,8 +2792,10 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2618,8 +2818,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2640,8 +2842,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2669,8 +2873,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2691,8 +2897,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2720,8 +2928,10 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2742,8 +2952,10 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2776,8 +2988,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc @@ -2798,8 +3012,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2827,8 +3043,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_slc @@ -2849,8 +3067,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2878,8 +3098,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 
into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc_slc @@ -2900,8 +3122,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2970,8 +3194,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -2994,8 +3220,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; 
GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -3028,12 +3256,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) @@ -3057,12 +3287,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) @@ -3098,15 +3330,18 @@ 
define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -3133,15 +3368,18 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = 
G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -3180,18 +3418,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -3217,18 +3458,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = 
G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll index a8ab9c7582751..d2c89ab19927e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -768,13 +768,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_0000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; UNPACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %16(<4 x s16>), %16(<4 x s16>) + ; UNPACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 - ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; UNPACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v3f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -790,13 +791,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_0000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 ; PACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF 
- ; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %16(<4 x s16>), %16(<4 x s16>) + ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; PACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 - ; PACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir index 8860ca6ba5e58..b75ec76d7ff44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -30,8 +30,10 @@ body: | ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s32>) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) + ; GCN: S_ENDPGM 0, implicit [[UV]](<3 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -49,8 +51,10 @@ body: | ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x p3>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x p3>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x p3>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x p3>) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x p3>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x p3>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x p3>), [[DEF]](<4 x p3>), [[DEF]](<4 x p3>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x p3>), [[UV1:%[0-9]+]]:_(<3 x p3>), [[UV2:%[0-9]+]]:_(<3 x p3>), [[UV3:%[0-9]+]]:_(<3 x p3>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x 
p3>) + ; GCN: S_ENDPGM 0, implicit [[UV]](<3 x p3>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -67,9 +71,12 @@ body: | ; GCN-LABEL: name: s_buffer_load_v6s16 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>) + ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) + ; GCN: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[UV]](<3 x s32>) + ; GCN: S_ENDPGM 0, implicit [[BITCAST]](<6 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<6 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -87,8 +94,10 @@ body: | ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 24, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s32>) + ; GCN: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<24 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>), [[DEF]](<8 x s32>), [[DEF]](<8 x s32>) + ; GCN: [[UV:%[0-9]+]]:_(<6 x s32>), [[UV1:%[0-9]+]]:_(<6 x s32>), [[UV2:%[0-9]+]]:_(<6 x s32>), [[UV3:%[0-9]+]]:_(<6 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<24 x s32>) + ; GCN: S_ENDPGM 0, implicit [[UV]](<6 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<6 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -106,8 +115,10 @@ body: | ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 24, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s64>) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>), [[DEF]](<4 x s64>), [[DEF]](<4 x s64>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) + ; GCN: S_ENDPGM 0, implicit [[UV]](<3 x s64>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x s64>) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -124,13 +135,85 @@ body: | ; GCN-LABEL: name: s_buffer_load_v12s8 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) - ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0 - ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>) + ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>) + ; GCN: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GCN: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C1]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GCN: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C2]](s32) + ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GCN: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C3]](s32) + ; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV4]](s32) + ; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GCN: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C1]](s32) + ; GCN: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C2]](s32) + ; GCN: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) + ; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV5]](s32) + ; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; GCN: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C1]](s32) + ; GCN: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C2]](s32) + ; GCN: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C3]](s32) + ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UV6]](s32) + ; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C4]] + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GCN: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; GCN: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; GCN: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]] + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GCN: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; GCN: 
[[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]] + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; GCN: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]] + ; GCN: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GCN: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GCN: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; GCN: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) + ; GCN: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C4]] + ; GCN: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GCN: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GCN: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) + ; GCN: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C4]] + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) + ; GCN: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C4]] + ; GCN: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; GCN: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GCN: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32) + ; GCN: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C4]] + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; GCN: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C4]] + ; GCN: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; GCN: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GCN: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; GCN: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS1]](<12 x s16>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 %2:_(<12 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 - S_ENDPGM 0, implicit %2 + %3:_(<12 x s16>) = G_ANYEXT %2 + S_ENDPGM 0, implicit %3 ... 
@@ -144,8 +227,10 @@ body: |
 ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
- ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
- ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s32>)
+ ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+ ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
+ ; GCN: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+ ; GCN: S_ENDPGM 0, implicit [[UV]](<3 x s32>)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = G_CONSTANT i32 0
 %2:_(<3 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
index d91886d1626da..739b76cad892c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
@@ -5884,37 +5884,42 @@ body: |
 ; CI-LABEL: name: test_load_constant_v3s16_align8
 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; CI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6, align 8, addrspace 4)
- ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+ ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+ ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 ; VI-LABEL: name: test_load_constant_v3s16_align8
 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; VI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6, align 8, addrspace 4)
- ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+ ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; VI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 ; GFX9-LABEL: name: test_load_constant_v3s16_align8
 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6, align 8, addrspace 4)
- ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+ ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 ; CI-MESA-LABEL: name: test_load_constant_v3s16_align8
 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6, align 8, addrspace 4)
- ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+ ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+ ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 ; GFX9-MESA-LABEL: name: test_load_constant_v3s16_align8
 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6, align 8, addrspace 4)
- ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+ ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+ ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
 ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 4)
@@ -5992,15 +5997,18 @@ body: |
 ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 + 4, addrspace 4)
 ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+ ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+ ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
- ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+ ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
- ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
- ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+ ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+ ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
 ; VI-LABEL: name: test_load_constant_v3s16_align2
 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
@@ -6022,15 +6030,18 @@ body: |
 ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 + 4, addrspace 4)
 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+ ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+ ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+ ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
- ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
- ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+ ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+ ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
- ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
- ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+ ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+ ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+ ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
 ; GFX9-LABEL: name: test_load_constant_v3s16_align2
 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
@@ -6046,15 +6057,18 @@ body: |
 ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 + 4, addrspace 4)
 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+ ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 
x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-MESA-LABEL: name: test_load_constant_v3s16_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6076,15 +6090,18 @@ body: | ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 + 4, addrspace 4) ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI-MESA: 
[[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-MESA-LABEL: name: test_load_constant_v3s16_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6100,15 +6117,18 @@ body: | ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 + 4, addrspace 4) ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 4) @@ -6172,15 +6192,18 @@ body: | ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; VI-LABEL: name: test_load_constant_v3s16_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6225,15 +6248,18 @@ body: | ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 
0 - ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-LABEL: name: test_load_constant_v3s16_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6275,15 +6301,18 @@ body: | ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-MESA-LABEL: name: test_load_constant_v3s16_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6334,15 +6363,18 @@ body: | ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI-MESA: 
[[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-MESA-LABEL: name: test_load_constant_v3s16_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -6384,15 +6416,18 @@ body: | ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 - ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], 
[[EXTRACT2]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 4) @@ -8262,37 +8297,42 @@ body: | ; CI-LABEL: name: test_load_constant_v3s64_align32 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 24, align 32, addrspace 4) - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) ; CI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0 ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>) ; VI-LABEL: name: test_load_constant_v3s64_align32 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 24, align 32, addrspace 4) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0 ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>) ; GFX9-LABEL: name: test_load_constant_v3s64_align32 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 24, align 32, addrspace 4) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0 ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>) ; CI-MESA-LABEL: name: test_load_constant_v3s64_align32 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 24, align 32, addrspace 4) - ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0 + ; CI-MESA: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0 ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>) ; GFX9-MESA-LABEL: name: test_load_constant_v3s64_align32 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 24, align 32, addrspace 4) - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0 ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<3 x s64>) = G_LOAD %0 :: (load 24, align 32, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index a3f43848e8feb..034f3903c6538 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -5924,37 +5924,42 @@ body: | ; CI-LABEL: name: test_load_flat_v3s16_align8 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 6, align 8) - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) ; VI-LABEL: name: test_load_flat_v3s16_align8 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 6, align 8) - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 
; VI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) ; GFX9-LABEL: name: test_load_flat_v3s16_align8 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 6, align 8) - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) ; CI-MESA-LABEL: name: test_load_flat_v3s16_align8 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 6, align 8) - ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) ; GFX9-MESA-LABEL: name: test_load_flat_v3s16_align8 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load 6, align 8) - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 0) @@ -6032,15 +6037,18 @@ body: | ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load 2 + 4) ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) 
= G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; VI-LABEL: name: test_load_flat_v3s16_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6062,15 +6070,18 @@ body: | ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load 2 + 4) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x 
s16>) ; GFX9-LABEL: name: test_load_flat_v3s16_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6086,15 +6097,18 @@ body: | ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load 2 + 4) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-MESA-LABEL: name: test_load_flat_v3s16_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6116,15 +6130,18 @@ body: | ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load 2 + 4) ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI-MESA: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-MESA-LABEL: name: test_load_flat_v3s16_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6140,15 +6157,18 @@ body: | ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load 2 + 4) ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x 
s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 0) @@ -6212,15 +6232,18 @@ body: | ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; VI-LABEL: name: test_load_flat_v3s16_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6265,15 +6288,18 @@ body: | ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), 
[[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-LABEL: name: test_load_flat_v3s16_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6315,15 +6341,18 @@ body: | ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-MESA-LABEL: name: test_load_flat_v3s16_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6374,15 +6403,18 @@ body: | ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: 
[[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-MESA-LABEL: name: test_load_flat_v3s16_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -6424,15 +6456,18 @@ body: | ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = 
G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32
-    ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
index a8e3aa5c8c7ce..8169aa76bfa6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir
@@ -1203,8 +1203,10 @@ body: |
     ; SI-LABEL: name: test_load_global_s96_align16
     ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 12, align 16, addrspace 1)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[LOAD]](<4 x s32>), 0
-    ; SI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[EXTRACT]](<3 x s32>)
+    ; SI: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[UV]](<3 x s32>)
     ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ; CI-HSA-LABEL: name: test_load_global_s96_align16
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -5929,44 +5931,50 @@ body: |
     ; SI-LABEL: name: test_load_global_v3s16_align8
     ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; CI-HSA-LABEL: name: test_load_global_v3s16_align8
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; CI-HSA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; CI-HSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-HSA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-HSA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-HSA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-HSA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; CI-HSA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; CI-MESA-LABEL: name: test_load_global_v3s16_align8
     ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; VI-LABEL: name: test_load_global_v3s16_align8
     ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; VI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; GFX9-HSA-LABEL: name: test_load_global_v3s16_align8
     ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; GFX9-HSA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; GFX9-HSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9-HSA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9-HSA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-HSA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9-HSA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; GFX9-HSA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; GFX9-MESA-LABEL: name: test_load_global_v3s16_align8
     ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 6, align 8, addrspace 1)
-    ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 1)
@@ -6056,15 +6064,18 @@ body: |
     ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load 2 + 4, addrspace 1)
     ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-HSA-LABEL: name: test_load_global_v3s16_align2
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6092,15 +6103,18 @@ body: |
     ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load 2 + 4, addrspace 1)
     ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; VI-LABEL: name: test_load_global_v3s16_align2
     ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6122,15 +6136,18 @@ body: |
     ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load 2 + 4, addrspace 1)
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; GFX9-HSA-LABEL: name: test_load_global_v3s16_align2
     ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6152,15 +6169,18 @@ body: |
     ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load 2 + 4, addrspace 1)
     ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0
-    ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 1)
@@ -6224,15 +6244,18 @@ body: |
     ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
     ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]]
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-HSA-LABEL: name: test_load_global_v3s16_align1
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6289,15 +6312,18 @@ body: |
     ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
     ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]]
     ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; CI-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; VI-LABEL: name: test_load_global_v3s16_align1
     ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6342,15 +6368,18 @@ body: |
     ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16)
     ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]]
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; GFX9-HSA-LABEL: name: test_load_global_v3s16_align1
     ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
@@ -6398,15 +6427,18 @@ body: |
     ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16)
     ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9-MESA: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; GFX9-MESA: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0
-    ; GFX9-MESA: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9-MESA: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; GFX9-MESA: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32
-    ; GFX9-MESA: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9-MESA: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; GFX9-MESA: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 1)
@@ -7036,8 +7068,10 @@ body: |
     ; SI-LABEL: name: test_load_global_v3s32_align16
     ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 12, align 16, addrspace 1)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[LOAD]](<4 x s32>), 0
-    ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[EXTRACT]](<3 x s32>)
+    ; SI: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+    ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[UV]](<3 x s32>)
     ; CI-HSA-LABEL: name: test_load_global_v3s32_align16
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-HSA: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p1) :: (load 12, align 16, addrspace 1)
@@ -8167,44 +8201,50 @@ body: |
     ; SI-LABEL: name: test_load_global_v3s64_align32
     ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     ; CI-HSA-LABEL: name: test_load_global_v3s64_align32
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-HSA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; CI-HSA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; CI-HSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; CI-HSA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; CI-HSA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; CI-HSA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; CI-HSA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; CI-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     ; CI-MESA-LABEL: name: test_load_global_v3s64_align32
     ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-MESA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; CI-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; CI-MESA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     ; VI-LABEL: name: test_load_global_v3s64_align32
     ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; VI: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     ; GFX9-HSA-LABEL: name: test_load_global_v3s64_align32
     ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; GFX9-HSA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; GFX9-HSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; GFX9-HSA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; GFX9-HSA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9-HSA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; GFX9-HSA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; GFX9-HSA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     ; GFX9-MESA-LABEL: name: test_load_global_v3s64_align32
     ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(<4 x s64>) = G_LOAD [[COPY]](p1) :: (load 24, align 32, addrspace 1)
-    ; GFX9-MESA: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[LOAD]](<4 x s64>), 0
+    ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s64>) = G_CONCAT_VECTORS [[LOAD]](<4 x s64>), %2(<4 x s64>), %2(<4 x s64>)
+    ; GFX9-MESA: [[UV:%[0-9]+]]:_(<3 x s64>), [[UV1:%[0-9]+]]:_(<3 x s64>), [[UV2:%[0-9]+]]:_(<3 x s64>), [[UV3:%[0-9]+]]:_(<3 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s64>)
     ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s64>), 0
+    ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[UV]](<3 x s64>), 0
     ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load 24, align 32, addrspace 1)
@@ -12643,23 +12683,25 @@ body: |
     ; SI-LABEL: name: test_extload_global_v2s96_from_24_align16
     ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 12, align 16, addrspace 1)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[LOAD]](<4 x s32>), 0
-    ; SI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[EXTRACT]](<3 x s32>)
+    ; SI: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s32>), [[UV1:%[0-9]+]]:_(<3 x s32>), [[UV2:%[0-9]+]]:_(<3 x s32>), [[UV3:%[0-9]+]]:_(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[UV]](<3 x s32>)
     ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
     ; SI: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
     ; SI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 8 + 12, align 4, addrspace 1)
     ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
     ; SI: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD]], [[C1]](s64)
     ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load 4 + 20, addrspace 1)
-    ; SI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0
+    ; SI: [[DEF1:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; SI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF1]], [[LOAD1]](<2 x s32>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64
     ; SI: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>)
     ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s96>) = G_BUILD_VECTOR [[BITCAST]](s96), [[BITCAST1]](s96)
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(s96) = G_EXTRACT [[BUILD_VECTOR]](<2 x s96>), 0
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(s96) = G_EXTRACT [[BUILD_VECTOR]](<2 x s96>), 96
-    ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[EXTRACT1]](s96)
-    ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[EXTRACT2]](s96)
+    ; SI: [[EXTRACT:%[0-9]+]]:_(s96) = G_EXTRACT [[BUILD_VECTOR]](<2 x s96>), 0
+    ; SI: [[EXTRACT1:%[0-9]+]]:_(s96) = G_EXTRACT [[BUILD_VECTOR]](<2 x s96>), 96
+    ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[EXTRACT]](s96)
+    ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[EXTRACT1]](s96)
     ; CI-HSA-LABEL: name: test_extload_global_v2s96_from_24_align16
     ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-HSA: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p1) :: (load 12, align 16, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
index f5a852bd98f36..376339482f5a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -6061,37 +6061,42 @@ body: |
     ; SI-LABEL: name: test_load_local_v3s16_align8
     ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; SI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 6, align 8, addrspace 3)
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; CI-LABEL: name: test_load_local_v3s16_align8
     ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; CI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 6, align 8, addrspace 3)
-    ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; CI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; CI-DS128-LABEL: name: test_load_local_v3s16_align8
     ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; CI-DS128: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 6, align 8, addrspace 3)
-    ; CI-DS128: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-DS128: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-DS128: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; CI-DS128: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; VI-LABEL: name: test_load_local_v3s16_align8
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; VI: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 6, align 8, addrspace 3)
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     ; GFX9-LABEL: name: test_load_local_v3s16_align8
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load 6, align 8, addrspace 3)
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[LOAD]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[LOAD]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0
     ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT]](<4 x s16>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 3)
@@ -6126,15 +6131,18 @@ body: |
     ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 + 4, addrspace 3)
     ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-LABEL: name: test_load_local_v3s16_align2
     ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6156,15 +6164,18 @@ body: |
     ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 + 4, addrspace 3)
     ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-DS128-LABEL: name: test_load_local_v3s16_align2
     ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6186,15 +6197,18 @@ body: |
     ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 + 4, addrspace 3)
     ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; CI-DS128: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-DS128: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-DS128: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-DS128: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI-DS128: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-DS128: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI-DS128: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; CI-DS128: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI-DS128: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-DS128: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI-DS128: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI-DS128: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; VI-LABEL: name: test_load_local_v3s16_align2
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6216,15 +6230,18 @@ body: |
     ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 + 4, addrspace 3)
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; GFX9-LABEL: name: test_load_local_v3s16_align2
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6240,15 +6257,18 @@ body: |
     ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 2 + 4, addrspace 3)
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 3)
@@ -6312,15 +6332,18 @@ body: |
     ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
     ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]]
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-LABEL: name: test_load_local_v3s16_align1
     ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6371,15 +6394,18 @@ body: |
     ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
     ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]]
     ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-DS128-LABEL: name: test_load_local_v3s16_align1
     ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6430,15 +6456,18 @@ body: |
     ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32)
     ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]]
     ; CI-DS128: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-DS128: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI-DS128: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI-DS128: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; CI-DS128: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-DS128: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI-DS128: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; CI-DS128: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI-DS128: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI-DS128: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI-DS128: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI-DS128: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI-DS128: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; VI-LABEL: name: test_load_local_v3s16_align1
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6483,15 +6512,18 @@ body: |
     ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16)
     ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]]
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; GFX9-LABEL: name: test_load_local_v3s16_align1
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
@@ -6533,15 +6565,18 @@ body: |
     ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16)
     ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32
-    ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
index 58f13f172d9ab..085cb619570d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
@@ -5027,15 +5027,18 @@ body: |
     ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 2 + 4, align 4, addrspace 5)
     ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[LOAD]](<2 x s16>), 0
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; CI-LABEL: name: test_load_private_v3s16_align8
     ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
@@ -5045,15 +5048,18 @@ body: |
     ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 2 + 4, align 4, addrspace 5)
     ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
     ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[LOAD]](<2 x s16>), 0
-    ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; VI-LABEL: name: test_load_private_v3s16_align8
     ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
@@ -5063,15 +5069,18 @@ body: |
     ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 2 + 4, align 4, addrspace 5)
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[LOAD]](<2 x s16>), 0
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
     ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0
-    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>)
+    ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0
     ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>)
     ; GFX9-LABEL: name: test_load_private_v3s16_align8
     ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
@@ -5081,15 +5090,18 @@ body: |
     ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 2 + 4, align 4, addrspace 5)
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
     ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[LOAD]](<2 x s16>), 0
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0
-    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<3
x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 8, addrspace 5) @@ -5124,15 +5136,18 @@ body: | ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 2 + 4, addrspace 5) ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-LABEL: name: test_load_private_v3s16_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5154,15 +5169,18 @@ body: | ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 2 + 4, addrspace 5) ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = 
G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; VI-LABEL: name: test_load_private_v3s16_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5184,15 +5202,18 @@ body: | ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 2 + 4, addrspace 5) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; VI: 
[[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-LABEL: name: test_load_private_v3s16_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5208,15 +5229,18 @@ body: | ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load 2 + 4, addrspace 5) ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[TRUNC]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 2, addrspace 5) @@ -5280,15 +5304,18 @@ body: | ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; 
SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; SI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; SI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; SI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CI-LABEL: name: test_load_private_v3s16_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5339,15 +5366,18 @@ body: | ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; CI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; CI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; CI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; CI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; CI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; CI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x 
s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; VI-LABEL: name: test_load_private_v3s16_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5392,15 +5422,18 @@ body: | ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BITCAST]](<2 x s16>), 0 - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; VI: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR3]](s16), 32 - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; VI: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; VI: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; GFX9-LABEL: name: test_load_private_v3s16_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5442,15 +5475,18 @@ body: | ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C2]](s16) ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), %2(<4 x s16>), %2(<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT]], [[BUILD_VECTOR_TRUNC]](<2 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT1]](<4 x s16>), 0 - ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = 
G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT1]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT2]], [[OR2]](s16), 32 - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT3]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT2]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT3]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load 6, align 1, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir index f5f7baeba73a4..4c69ff2edfa48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir @@ -726,14 +726,17 @@ body: | ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[EXTRACT2]], [[EXTRACT4]](<2 x s16>) ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[EXTRACT3]], [[EXTRACT5]](s16) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT6:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT6]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[LSHR]](<2 x s16>), 0 - ; GFX9: [[EXTRACT7:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT5]](<4 x s16>), 0 - ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT7]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[LSHR1]](s16), 32 - ; GFX9: [[EXTRACT8:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT7]](<4 x s16>), 0 - ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT8]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES 
[[CONCAT_VECTORS2]](<12 x s16>) + ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT8]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir index b4a5c48eb2e68..a91139fc7fd50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-or.mir @@ -361,15 +361,18 @@ body: | ; CHECK-LABEL: name: test_or_v3s16 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CHECK: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[INSERT]], [[INSERT1]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[OR]](<4 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[OR]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF @@ -403,27 +406,32 @@ body: | ; CHECK-LABEL: name: test_or_v5s16 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF]](<6 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF1]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>), [[UV2:%[0-9]+]]:_(<5 x s16>), [[UV3:%[0-9]+]]:_(<5 x s16>), [[UV4:%[0-9]+]]:_(<5 x s16>), [[UV5:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 - ; CHECK: 
[[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<5 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) - ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF2]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV6:%[0-9]+]]:_(<5 x s16>), [[UV7:%[0-9]+]]:_(<5 x s16>), [[UV8:%[0-9]+]]:_(<5 x s16>), [[UV9:%[0-9]+]]:_(<5 x s16>), [[UV10:%[0-9]+]]:_(<5 x s16>), [[UV11:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<30 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV]](<5 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV6]](<5 x s16>), 0 + ; CHECK: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) + ; CHECK: [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) ; CHECK: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV14]](<3 x s16>), 0 ; CHECK: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[INSERT2]], [[INSERT3]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[OR]](<4 x s16>), 0 - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV1]](<3 x s16>), 0 - ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV3]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[OR]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), [[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV13]](<3 x s16>), 0 + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV15]](<3 x s16>), 0 ; CHECK: [[OR1:%[0-9]+]]:_(<4 x s16>) = G_OR [[INSERT4]], [[INSERT5]] - ; CHECK: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[OR1]](<4 x s16>), 0 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT2]](<3 x s16>), [[EXTRACT3]](<3 x s16>) - ; CHECK: [[EXTRACT4:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[OR1]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV20:%[0-9]+]]:_(<3 x s16>), [[UV21:%[0-9]+]]:_(<3 x s16>), [[UV22:%[0-9]+]]:_(<3 x s16>), [[UV23:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; CHECK: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV16]](<3 x s16>), [[UV20]](<3 x s16>) + ; CHECK: [[CONCAT_VECTORS5:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS4]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV24:%[0-9]+]]:_(<5 x s16>), [[UV25:%[0-9]+]]:_(<5 x s16>), [[UV26:%[0-9]+]]:_(<5 x s16>), [[UV27:%[0-9]+]]:_(<5 x s16>), [[UV28:%[0-9]+]]:_(<5 x s16>), [[UV29:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS5]](<30 
x s16>) ; CHECK: [[DEF4:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[EXTRACT4]](<5 x s16>), 0 + ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[UV24]](<5 x s16>), 0 ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT6]](<8 x s16>) %0:_(<5 x s16>) = G_IMPLICIT_DEF %1:_(<5 x s16>) = G_IMPLICIT_DEF @@ -463,8 +471,9 @@ body: | ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ANYEXT6]], [[ANYEXT7]] ; CHECK: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[OR3]](s32) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[BUILD_VECTOR]](<4 x s8>), 0 - ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s8>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<4 x s8>), [[DEF2]](<4 x s8>), [[DEF2]](<4 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s8>), [[UV9:%[0-9]+]]:_(<3 x s8>), [[UV10:%[0-9]+]]:_(<3 x s8>), [[UV11:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV8]](<3 x s8>) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT8]](<3 x s32>) %0:_(<3 x s8>) = G_IMPLICIT_DEF %1:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir index 81408b79b11f7..0b1f7be76a0fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-phi.mir @@ -174,9 +174,10 @@ body: | ; CHECK: G_BR %bb.2 ; CHECK: bb.2: ; CHECK: [[PHI:%[0-9]+]]:_(<4 x s16>) = G_PHI [[INSERT]](<4 x s16>), %bb.0, [[INSERT3]](<4 x s16>), %bb.1 - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[PHI]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[PHI]](<4 x s16>), %7(<4 x s16>), %7(<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[UV4]](<3 x s16>), 0 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT4]](<4 x s16>) ; CHECK: S_SETPC_B64 undef $sgpr30_sgpr31 bb.0: @@ -651,24 +652,21 @@ body: | ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) - ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) - ; CHECK: [[COPY2:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] ; CHECK: G_BRCOND [[ICMP]](s1), %bb.1 ; CHECK: G_BR %bb.2 ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), 
[[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) - ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) - ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) - ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<16 x s32>) + ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) + ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) + ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) ; CHECK: [[UV64:%[0-9]+]]:_(s32), [[UV65:%[0-9]+]]:_(s32), [[UV66:%[0-9]+]]:_(s32), [[UV67:%[0-9]+]]:_(s32), [[UV68:%[0-9]+]]:_(s32), [[UV69:%[0-9]+]]:_(s32), [[UV70:%[0-9]+]]:_(s32), [[UV71:%[0-9]+]]:_(s32), [[UV72:%[0-9]+]]:_(s32), [[UV73:%[0-9]+]]:_(s32), [[UV74:%[0-9]+]]:_(s32), [[UV75:%[0-9]+]]:_(s32), [[UV76:%[0-9]+]]:_(s32), [[UV77:%[0-9]+]]:_(s32), [[UV78:%[0-9]+]]:_(s32), [[UV79:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) - ; CHECK: [[UV80:%[0-9]+]]:_(s32), [[UV81:%[0-9]+]]:_(s32), [[UV82:%[0-9]+]]:_(s32), [[UV83:%[0-9]+]]:_(s32), [[UV84:%[0-9]+]]:_(s32), [[UV85:%[0-9]+]]:_(s32), [[UV86:%[0-9]+]]:_(s32), [[UV87:%[0-9]+]]:_(s32), 
[[UV88:%[0-9]+]]:_(s32), [[UV89:%[0-9]+]]:_(s32), [[UV90:%[0-9]+]]:_(s32), [[UV91:%[0-9]+]]:_(s32), [[UV92:%[0-9]+]]:_(s32), [[UV93:%[0-9]+]]:_(s32), [[UV94:%[0-9]+]]:_(s32), [[UV95:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) - ; CHECK: [[UV96:%[0-9]+]]:_(s32), [[UV97:%[0-9]+]]:_(s32), [[UV98:%[0-9]+]]:_(s32), [[UV99:%[0-9]+]]:_(s32), [[UV100:%[0-9]+]]:_(s32), [[UV101:%[0-9]+]]:_(s32), [[UV102:%[0-9]+]]:_(s32), [[UV103:%[0-9]+]]:_(s32), [[UV104:%[0-9]+]]:_(s32), [[UV105:%[0-9]+]]:_(s32), [[UV106:%[0-9]+]]:_(s32), [[UV107:%[0-9]+]]:_(s32), [[UV108:%[0-9]+]]:_(s32), [[UV109:%[0-9]+]]:_(s32), [[UV110:%[0-9]+]]:_(s32), [[UV111:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) - ; CHECK: [[UV112:%[0-9]+]]:_(s32), [[UV113:%[0-9]+]]:_(s32), [[UV114:%[0-9]+]]:_(s32), [[UV115:%[0-9]+]]:_(s32), [[UV116:%[0-9]+]]:_(s32), [[UV117:%[0-9]+]]:_(s32), [[UV118:%[0-9]+]]:_(s32), [[UV119:%[0-9]+]]:_(s32), [[UV120:%[0-9]+]]:_(s32), [[UV121:%[0-9]+]]:_(s32), [[UV122:%[0-9]+]]:_(s32), [[UV123:%[0-9]+]]:_(s32), [[UV124:%[0-9]+]]:_(s32), [[UV125:%[0-9]+]]:_(s32), [[UV126:%[0-9]+]]:_(s32), [[UV127:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<16 x s32>) + ; CHECK: [[UV80:%[0-9]+]]:_(s32), [[UV81:%[0-9]+]]:_(s32), [[UV82:%[0-9]+]]:_(s32), [[UV83:%[0-9]+]]:_(s32), [[UV84:%[0-9]+]]:_(s32), [[UV85:%[0-9]+]]:_(s32), [[UV86:%[0-9]+]]:_(s32), [[UV87:%[0-9]+]]:_(s32), [[UV88:%[0-9]+]]:_(s32), [[UV89:%[0-9]+]]:_(s32), [[UV90:%[0-9]+]]:_(s32), [[UV91:%[0-9]+]]:_(s32), [[UV92:%[0-9]+]]:_(s32), [[UV93:%[0-9]+]]:_(s32), [[UV94:%[0-9]+]]:_(s32), [[UV95:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) + ; CHECK: [[UV96:%[0-9]+]]:_(s32), [[UV97:%[0-9]+]]:_(s32), [[UV98:%[0-9]+]]:_(s32), [[UV99:%[0-9]+]]:_(s32), [[UV100:%[0-9]+]]:_(s32), [[UV101:%[0-9]+]]:_(s32), [[UV102:%[0-9]+]]:_(s32), [[UV103:%[0-9]+]]:_(s32), [[UV104:%[0-9]+]]:_(s32), [[UV105:%[0-9]+]]:_(s32), [[UV106:%[0-9]+]]:_(s32), [[UV107:%[0-9]+]]:_(s32), [[UV108:%[0-9]+]]:_(s32), [[UV109:%[0-9]+]]:_(s32), [[UV110:%[0-9]+]]:_(s32), [[UV111:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) + ; CHECK: [[UV112:%[0-9]+]]:_(s32), [[UV113:%[0-9]+]]:_(s32), [[UV114:%[0-9]+]]:_(s32), [[UV115:%[0-9]+]]:_(s32), [[UV116:%[0-9]+]]:_(s32), [[UV117:%[0-9]+]]:_(s32), [[UV118:%[0-9]+]]:_(s32), [[UV119:%[0-9]+]]:_(s32), [[UV120:%[0-9]+]]:_(s32), [[UV121:%[0-9]+]]:_(s32), [[UV122:%[0-9]+]]:_(s32), [[UV123:%[0-9]+]]:_(s32), [[UV124:%[0-9]+]]:_(s32), [[UV125:%[0-9]+]]:_(s32), [[UV126:%[0-9]+]]:_(s32), [[UV127:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<16 x s32>) ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UV64]] ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UV65]] ; CHECK: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[UV2]], [[UV66]] @@ -740,9 +738,9 @@ body: | ; CHECK: G_BR %bb.2 ; CHECK: bb.2: ; CHECK: [[PHI:%[0-9]+]]:_(<16 x s32>) = G_PHI [[DEF]](<16 x s32>), %bb.0, [[BUILD_VECTOR]](<16 x s32>), %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:_(<16 x s32>) = G_PHI [[COPY]](<16 x s32>), %bb.0, [[BUILD_VECTOR1]](<16 x s32>), %bb.1 - ; CHECK: [[PHI2:%[0-9]+]]:_(<16 x s32>) = G_PHI [[COPY1]](<16 x s32>), %bb.0, [[BUILD_VECTOR2]](<16 x s32>), %bb.1 - ; CHECK: [[PHI3:%[0-9]+]]:_(<16 x s32>) = G_PHI [[COPY2]](<16 x s32>), %bb.0, [[BUILD_VECTOR3]](<16 x s32>), %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:_(<16 x s32>) = G_PHI [[DEF]](<16 x s32>), %bb.0, [[BUILD_VECTOR1]](<16 x s32>), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:_(<16 x s32>) = G_PHI [[DEF]](<16 x s32>), %bb.0, [[BUILD_VECTOR2]](<16 x s32>), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:_(<16 x s32>) = G_PHI [[DEF]](<16 x s32>), %bb.0, 
[[BUILD_VECTOR3]](<16 x s32>), %bb.1 ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[PHI]](<16 x s32>), [[PHI1]](<16 x s32>), [[PHI2]](<16 x s32>), [[PHI3]](<16 x s32>) ; CHECK: S_SETPC_B64 undef $sgpr30_sgpr31, implicit [[CONCAT_VECTORS]](<64 x s32>) bb.0: @@ -1547,3 +1545,94 @@ body: | S_SETPC_B64 undef $sgpr30_sgpr31 ... +--- +name: test_phi_v2s1 +tracksRegLiveness: true + +body: | + ; CHECK-LABEL: name: test_phi_v2s1 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND]](s32), [[AND1]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND2]](s32), [[AND3]] + ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: G_BRCOND [[ICMP2]](s1), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C3]](s32) + ; CHECK: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] + ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[AND4]](s32), [[AND5]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C4]] + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] + ; CHECK: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[AND6]](s32), [[AND7]] + ; CHECK: G_BR %bb.2 + ; CHECK: bb.2: + ; CHECK: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP3]](s1), %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.0, [[ICMP4]](s1), %bb.1 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[PHI]](s1) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[PHI1]](s1) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY 
[[ANYEXT]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ANYEXT1]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C5]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: S_SETPC_B64 undef $sgpr30_sgpr31 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr1 + %4:_(s32) = G_CONSTANT i32 0 + %5:_(<2 x s1>) = G_ICMP intpred(eq), %0, %1 + %6:_(s1) = G_ICMP intpred(eq), %3, %4 + G_BRCOND %6, %bb.1 + G_BR %bb.2 + + bb.1: + successors: %bb.2 + + %7:_(<2 x s1>) = G_ICMP intpred(ne), %0, %2 + G_BR %bb.2 + + bb.2: + %8:_(<2 x s1>) = G_PHI %5, %bb.0, %7, %bb.1 + %9:_(<2 x s32>) = G_ZEXT %8 + $vgpr0_vgpr1 = COPY %9 + S_SETPC_B64 undef $sgpr30_sgpr31 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir index 709e9a24493c6..e014376446a94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptr-add.mir @@ -143,3 +143,237 @@ body: | %2:_(p999) = G_PTR_ADD %0, %1 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_gep_v2p1_v2i64 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[UV2]](s64) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[UV3]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:_(<2 x p1>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 +... + +--- +name: test_gep_v2p3_v2s32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[UV2]](s32) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[UV3]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
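+# The s16 index is narrower than the 64-bit index type of p1, so it should
+# be widened via G_ANYEXT followed by G_SEXT_INREG to keep the offset signed.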
+ +--- +name: test_gep_global_s16_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_global_s16_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY1]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ANYEXT]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[SEXT_INREG]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(p1) = G_PTR_ADD %0, %2 + $vgpr0_vgpr1 = COPY %3 +... + +--- +name: test_gep_global_s32_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_global_s32_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[SEXT]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(p1) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: test_gep_global_s96_idx +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 + + ; CHECK-LABEL: name: test_gep_global_s96_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr2_vgpr3_vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[TRUNC]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[PTR_ADD]](p1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s96) = COPY $vgpr2_vgpr3_vgpr4 + %2:_(p1) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: test_gep_local_i16_idx +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_gep_local_i16_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[SEXT_INREG]](s32) + ; CHECK: $vgpr0 = COPY [[PTR_ADD]](p3) + %0:_(p3) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(p3) = G_PTR_ADD %0, %2 + $vgpr0 = COPY %3 +... + +--- +name: test_gep_local_i64_idx +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2 + + ; CHECK-LABEL: name: test_gep_local_i64_idx + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr1_vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[TRUNC]](s32) + ; CHECK: $vgpr0 = COPY [[PTR_ADD]](p3) + %0:_(p3) = COPY $vgpr0 + %1:_(s64) = COPY $vgpr1_vgpr2 + %2:_(p3) = G_PTR_ADD %0, %1 + $vgpr0 = COPY %2 +... 
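+# Scalarized vector case: each s32 index is sign-extended to s64 with G_SEXT
+# before the per-element G_PTR_ADD on the p1 elements.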
+ +--- +name: test_gep_v2p1_v2i32 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[UV2]](s32) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[SEXT]](s64) + ; CHECK: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[UV3]](s32) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[SEXT1]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s32>) = COPY $vgpr4_vgpr5 + %2:_(<2 x p1>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 +... + +--- +name: test_gep_v2p1_v2i96 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6, $vgpr7_vgpr8_vgpr9 + + ; CHECK-LABEL: name: test_gep_v2p1_v2i96 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr4_vgpr5_vgpr6 + ; CHECK: [[COPY2:%[0-9]+]]:_(s96) = COPY $vgpr7_vgpr8_vgpr9 + ; CHECK: [[UV:%[0-9]+]]:_(p1), [[UV1:%[0-9]+]]:_(p1) = G_UNMERGE_VALUES [[COPY]](<2 x p1>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[COPY1]](s96) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV]], [[TRUNC]](s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s64) = G_TRUNC [[COPY2]](s96) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[UV1]], [[TRUNC1]](s64) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[PTR_ADD]](p1), [[PTR_ADD1]](p1) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) + %0:_(<2 x p1>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s96) = COPY $vgpr4_vgpr5_vgpr6 + %2:_(s96) = COPY $vgpr7_vgpr8_vgpr9 + %3:_(<2 x s96>) = G_BUILD_VECTOR %1, %2 + %4:_(<2 x p1>) = G_PTR_ADD %0, %3 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %4 +... + +--- +name: test_gep_v2p3_v2s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[SEXT_INREG]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[SEXT_INREG1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
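+# p3 pointers use a 32-bit index type, so the s64 indices are truncated to
+# s32 before each per-element G_PTR_ADD.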
+ +--- +name: test_gep_v2p3_v2s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_gep_v2p3_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; CHECK: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[UV2]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV]], [[TRUNC]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[UV3]](s64) + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[UV1]], [[TRUNC1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[PTR_ADD]](p3), [[PTR_ADD1]](p3) + ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + %0:_(<2 x p3>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + %2:_(<2 x p3>) = G_PTR_ADD %0, %1 + $vgpr0_vgpr1 = COPY %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir index 28a8efad1d102..d96c88b027e59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -59,17 +59,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C2]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[ADD]], [[C]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -137,17 +128,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C2]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[ADD]], [[C]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ 
-282,34 +264,19 @@ body: | ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C6]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C4]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C6]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C5]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[SHL1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[SMIN1]] - ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[ADD]], [[C3]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL]], [[SHL1]] + ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[SMAX2:%[0-9]+]]:_(s16) = G_SMAX [[SHL2]], [[C6]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[C4]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s16) = G_SMIN [[SHL2]], [[C6]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[C5]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s16) = G_SMAX [[SUB3]], [[SHL3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s16) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[ADD1]], [[C3]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(s16) = G_SADDSAT [[SHL2]], [[SHL3]] + ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SADDSAT1]], [[C3]](s16) + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[ASHR]](s16) - ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C7]] + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[ASHR1]](s16) - ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C7]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]] ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16) ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]] ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -375,17 +342,8 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 - ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB1]], [[TRUNC1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[SMIN1]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SADDSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 
@@ -484,20 +442,8 @@ body: | ; GFX9-LABEL: name: saddsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[ADD]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SADDSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SADDSAT %0, %1 @@ -582,9 +528,11 @@ body: | ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX6: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX6: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX8-LABEL: name: saddsat_v3s16 ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) @@ -644,9 +592,11 @@ body: | ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX8: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX8: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x 
s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: saddsat_v3s16 ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) @@ -676,34 +626,15 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC5]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC7]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC8]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SADDSAT]](<2 x s16>), [[SADDSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX9: 
$vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 %3:_(<3 x s16>) = G_SADDSAT %1, %2 @@ -870,30 +801,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB1]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[SMIN1]] - ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32) - ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC3]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV1]], [[SMIN3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[UV]], [[UV2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SADDSAT]](<2 x s16>), [[SADDSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -938,17 +848,8 @@ body: | ; GFX9-LABEL: name: saddsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[COPY1]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[SMIN1]] - ; GFX9: $vgpr0 = COPY [[ADD]](s32) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[SADDSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_SADDSAT %0, %1 @@ -1014,24 +915,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[C2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] - ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[C2]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] - ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[UV2]] - ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[SMIN1]] - ; GFX9: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[C2]] - ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX2]] - ; GFX9: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[C2]] - ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN2]] - ; GFX9: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[UV3]] - ; GFX9: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[SMIN3]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX9: [[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[UV]], [[UV2]] + ; GFX9: [[SADDSAT1:%[0-9]+]]:_(s32) = G_SADDSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SADDSAT]](s32), [[SADDSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir index 88689ae03616f..6dae4e59373c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-select.mir @@ -291,8 +291,9 @@ body: | ; CHECK: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[ICMP]](s1), [[ANYEXT6]], [[ANYEXT7]] ; CHECK: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[SELECT3]](s16) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[BUILD_VECTOR]](<4 x s8>), 0 - ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s8>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s8>), [[UV9:%[0-9]+]]:_(<3 x s8>), [[UV10:%[0-9]+]]:_(<3 x s8>), 
[[UV11:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV8]](<3 x s8>) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT8]](<3 x s32>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 @@ -399,8 +400,9 @@ body: | ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT1]](<3 x s16>), 0 ; CHECK: [[SELECT:%[0-9]+]]:_(<4 x s16>) = G_SELECT [[ICMP]](s1), [[INSERT]], [[INSERT1]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[SELECT]](<4 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT2]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[SELECT]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir index 10e32d7f87c90..2d8812d5e7e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir @@ -710,14 +710,17 @@ body: | ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[EXTRACT2]], [[EXTRACT4]](<2 x s16>) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[EXTRACT3]], [[EXTRACT5]](s16) ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT6:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT6]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; GFX9: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV]](<3 x s16>), 0 ; GFX9: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT4]], [[SHL]](<2 x s16>), 0 - ; GFX9: [[EXTRACT7:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT5]](<4 x s16>), 0 - ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT7]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT5]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT6:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV4]](<3 x s16>), 0 ; GFX9: [[INSERT7:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[INSERT6]], [[SHL1]](s16), 32 - ; GFX9: [[EXTRACT8:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[INSERT7]](<4 x s16>), 0 - ; GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT8]](<3 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[INSERT7]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; 
GFX9: [[INSERT8:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[UV8]](<3 x s16>), 0 ; GFX9: $vgpr0_vgpr1 = COPY [[INSERT8]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir index 9f614bef378ed..692078edbe65f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir @@ -217,16 +217,18 @@ body: | ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -250,15 +252,17 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir index d4de328b679a7..d4b80ce4a5721 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -45,13 +45,15 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -80,14 +82,16 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_0 @@ -115,16 +119,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; 
GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -154,15 +160,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -191,15 +199,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -267,15 +278,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: 
[[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -304,13 +318,15 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -339,14 +355,16 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x 
s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_2 @@ -374,16 +392,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -413,15 +433,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -561,15 +583,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -577,10 +602,13 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -602,15 +630,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -640,15 +671,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = 
G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -657,11 +690,13 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -683,16 +718,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], 
[[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -700,12 +737,14 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -728,15 +767,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -745,11 +786,13 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) 
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -771,16 +814,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -788,12 +833,14 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -933,24 +980,38 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; GFX8: 
[[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_0 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -971,24 +1032,36 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48 - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16) + ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = 
G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32
+    ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+    ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+    ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
     ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16)
-    ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16)
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32)
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
+    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32
+    ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
     ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
index a1087058ae5d6..0350c1cc53a01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
@@ -349,21 +349,24 @@ body: |
     ; SI-LABEL: name: test_smax_v3s16
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
-    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+    ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
     ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
     ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
-    ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
     ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
     ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
     ; SI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16
@@ -394,31 +397,34 @@ body: |
     ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
     ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
     ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+    ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
     ; VI-LABEL: name: test_smax_v3s16
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
-    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+    ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
     ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
     ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
-    ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
     ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
     ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
     ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
     ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; VI: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC3]]
@@ -434,24 +440,27 @@ body: |
     ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
     ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
     ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+    ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
     ; GFX9-LABEL: name: test_smax_v3s16
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0
-    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
-    ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV8]], [[UV10]]
+    ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV9]], [[UV11]]
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
+    ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>)
+    ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>)
     %0:_(<3 x s16>) = G_IMPLICIT_DEF
     %1:_(<3 x s16>) = G_IMPLICIT_DEF
     %2:_(<3 x s16>) = G_SMAX %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
index 483681f18de84..0e2dbbc948bdb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
@@ -349,21 +349,24 @@ body: |
     ; SI-LABEL: name: test_smin_v3s16
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
-    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+    ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
     ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
     ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
-    ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
     ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
     ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
     ; SI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16
@@ -394,31 +397,34 @@ body: |
     ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
     ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
     ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+    ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
     ; VI-LABEL: name: test_smin_v3s16
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
-    ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+    ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
     ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
     ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
     ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
-    ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
     ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
     ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
     ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
     ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; VI: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC3]]
@@ -434,24 +440,27 @@ body: |
     ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
     ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
     ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
-    ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+    ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
     ; GFX9-LABEL: name: test_smin_v3s16
     ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
     ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0
-    ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
-    ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV8]], [[UV10]]
+    ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV9]], [[UV11]]
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
+    ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>)
+    ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>)
     %0:_(<3 x s16>) = G_IMPLICIT_DEF
     %1:_(<3 x s16>) = G_IMPLICIT_DEF
     %2:_(<3 x s16>) = G_SMIN %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
index 40eb12034c97b..fc1f5ab266ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
@@ -59,17 +59,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C1]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C2]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C]](s16)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -137,17 +128,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C3]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C1]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C3]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C2]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C]](s16)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -282,34 +264,19 @@ body: |
    ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
-    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[SHL]], [[C6]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C4]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[SHL]], [[C6]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C5]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[SHL1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[SMIN1]]
-    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SUB2]], [[C3]](s16)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT]], [[C3]](s16)
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
     ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16)
     ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16)
-    ; GFX9: [[SMAX2:%[0-9]+]]:_(s16) = G_SMAX [[SHL2]], [[C6]]
-    ; GFX9: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[SMAX2]], [[C4]]
-    ; GFX9: [[SMIN2:%[0-9]+]]:_(s16) = G_SMIN [[SHL2]], [[C6]]
-    ; GFX9: [[SUB4:%[0-9]+]]:_(s16) = G_SUB [[SMIN2]], [[C5]]
-    ; GFX9: [[SMAX3:%[0-9]+]]:_(s16) = G_SMAX [[SUB3]], [[SHL3]]
-    ; GFX9: [[SMIN3:%[0-9]+]]:_(s16) = G_SMIN [[SMAX3]], [[SUB4]]
-    ; GFX9: [[SUB5:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[SMIN3]]
-    ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SUB5]], [[C3]](s16)
-    ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(s16) = G_SSUBSAT [[SHL2]], [[SHL3]]
+    ; GFX9: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[SSUBSAT1]], [[C3]](s16)
+    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[ASHR]](s16)
-    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C7]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[ASHR1]](s16)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C7]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]]
     ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
     ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
@@ -375,17 +342,8 @@ body: |
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
-    ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[C2]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SMAX]], [[C]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[C2]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SMIN]], [[C1]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[SUB]], [[TRUNC1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[SMIN1]]
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB2]](s16)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[TRUNC]], [[TRUNC1]]
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSUBSAT]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -484,20 +442,8 @@ body: |
     ; GFX9-LABEL: name: ssubsat_v2s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
-    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32)
-    ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC1]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[COPY1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[COPY]], [[SMIN1]]
-    ; GFX9: $vgpr0 = COPY [[SUB2]](<2 x s16>)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[COPY]], [[COPY1]]
+    ; GFX9: $vgpr0 = COPY [[SSUBSAT]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_SSUBSAT %0, %1
@@ -582,9 +528,11 @@ body: |
     ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX6: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX6: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX8-LABEL: name: ssubsat_v3s16
     ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
@@ -644,9 +592,11 @@ body: |
     ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX8: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX8: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-LABEL: name: ssubsat_v3s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
@@ -676,34 +626,15 @@ body: |
     ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32)
     ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767
-    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
-    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32)
-    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32)
-    ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC4]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC6]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC5]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[SMIN1]]
-    ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C3]](s32), [[C3]](s32)
-    ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]]
-    ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX2]], [[BUILD_VECTOR_TRUNC7]]
-    ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC9]]
-    ; GFX9: [[SUB4:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN2]], [[BUILD_VECTOR_TRUNC8]]
-    ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[BUILD_VECTOR_TRUNC3]]
-    ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB4]]
-    ; GFX9: [[SUB5:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[SMIN3]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SUB2]](<2 x s16>), [[SUB5]](<2 x s16>), [[DEF2]](<2 x s16>)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]]
+    ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SSUBSAT]](<2 x s16>), [[SSUBSAT1]](<2 x s16>), [[DEF2]](<2 x s16>)
     ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX9: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0
     %3:_(<3 x s16>) = G_SSUBSAT %1, %2
@@ -870,30 +801,9 @@ body: |
     ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
     ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
-    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32)
-    ; GFX9: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX]], [[BUILD_VECTOR_TRUNC]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN]], [[BUILD_VECTOR_TRUNC1]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB]], [[UV2]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV]], [[SMIN1]]
-    ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C2]](s32), [[C2]](s32)
-    ; GFX9: [[SMAX2:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[BUILD_VECTOR_TRUNC5]]
-    ; GFX9: [[SUB3:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMAX2]], [[BUILD_VECTOR_TRUNC3]]
-    ; GFX9: [[SMIN2:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[BUILD_VECTOR_TRUNC5]]
-    ; GFX9: [[SUB4:%[0-9]+]]:_(<2 x s16>) = G_SUB [[SMIN2]], [[BUILD_VECTOR_TRUNC4]]
-    ; GFX9: [[SMAX3:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[SUB3]], [[UV3]]
-    ; GFX9: [[SMIN3:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[SMAX3]], [[SUB4]]
-    ; GFX9: [[SUB5:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV1]], [[SMIN3]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SUB2]](<2 x s16>), [[SUB5]](<2 x s16>)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[UV]], [[UV2]]
+    ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[UV1]], [[UV3]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SSUBSAT]](<2 x s16>), [[SSUBSAT1]](<2 x s16>)
     ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
@@ -938,17 +848,8 @@ body: |
     ; GFX9-LABEL: name: ssubsat_s32
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[COPY1]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[SMIN1]]
-    ; GFX9: $vgpr0 = COPY [[SUB2]](s32)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[COPY]], [[COPY1]]
+    ; GFX9: $vgpr0 = COPY [[SSUBSAT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = G_SSUBSAT %0, %1
@@ -1014,24 +915,9 @@ body: |
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[C2]]
-    ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]]
-    ; GFX9: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[C2]]
-    ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]]
-    ; GFX9: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[UV2]]
-    ; GFX9: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]]
-    ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[SMIN1]]
-    ; GFX9: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[C2]]
-    ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C]]
-    ; GFX9: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[C2]]
-    ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C1]]
-    ; GFX9: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[UV3]]
-    ; GFX9: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]]
-    ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[SMIN3]]
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB2]](s32), [[SUB5]](s32)
+    ; GFX9: [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[UV]], [[UV2]]
+    ; GFX9: [[SSUBSAT1:%[0-9]+]]:_(s32) = G_SSUBSAT [[UV1]], [[UV3]]
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SSUBSAT]](s32), [[SSUBSAT1]](s32)
     ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
index 690bf34482ddd..e6a7193a3b0f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
@@ -32,11 +32,8 @@ body: |
     ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9
     ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16)
     ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
     ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: uaddsat_s7
@@ -47,11 +44,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -92,11 +86,8 @@ body: |
     ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16)
     ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
     ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: uaddsat_s8
@@ -107,11 +98,8 @@ body: |
     ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16)
-    ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C1]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C]](s16)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C]](s16)
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -187,24 +175,19 @@ body: |
     ; GFX8: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
     ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
-    ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C4]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C3]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C3]](s16)
     ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
     ; GFX8: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16)
     ; GFX8: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16)
-    ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[SHL2]], [[C4]]
-    ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[SHL3]]
-    ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[UMIN1]]
-    ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[ADD1]], [[C3]](s16)
-    ; GFX8: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL2]], [[SHL3]]
+    ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT1]], [[C3]](s16)
+    ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; GFX8: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16)
-    ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C5]]
+    ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]]
     ; GFX8: [[COPY3:%[0-9]+]]:_(s16) = COPY [[LSHR7]](s16)
-    ; GFX8: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C5]]
+    ; GFX8: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]]
     ; GFX8: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
     ; GFX8: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]]
     ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
@@ -226,24 +209,19 @@ body: |
     ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
     ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16)
     ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16)
-    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SHL]], [[C4]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[SHL1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[SHL]], [[UMIN]]
-    ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[ADD]], [[C3]](s16)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL]], [[SHL1]]
+    ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT]], [[C3]](s16)
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
     ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16)
     ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16)
-    ; GFX9: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[SHL2]], [[C4]]
-    ; GFX9: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[SHL3]]
-    ; GFX9: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[SHL2]], [[UMIN1]]
-    ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[ADD1]], [[C3]](s16)
-    ; GFX9: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[SHL2]], [[SHL3]]
+    ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[UADDSAT1]], [[C3]](s16)
+    ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16)
-    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C5]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s16) = COPY [[LSHR7]](s16)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C5]]
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[COPY3]], [[C4]]
     ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
     ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL4]]
     ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
@@ -286,22 +264,16 @@ body: |
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC1]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]]
-    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]]
+    ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16)
     ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: uaddsat_s16
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]]
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s16)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]]
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
@@ -364,15 +336,10 @@ body: |
     ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
     ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
     ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC2]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]]
-    ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]]
-    ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC3]]
-    ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]]
-    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16)
-    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC2]]
+    ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC3]]
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16)
+    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16)
     ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
@@ -380,12 +347,8 @@ body: |
     ; GFX9-LABEL: name: uaddsat_v2s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY]], [[BUILD_VECTOR_TRUNC]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[COPY1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[COPY]], [[UMIN]]
-    ; GFX9: $vgpr0 = COPY [[ADD]](<2 x s16>)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[COPY]], [[COPY1]]
+    ; GFX9: $vgpr0 = COPY [[UADDSAT]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_UADDSAT %0, %1
@@ -456,9 +419,11 @@ body: |
     ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX6: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX6: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX8-LABEL: name: uaddsat_v3s16
     ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
@@ -482,31 +447,26 @@ body: |
     ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>)
     ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC3]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]]
-    ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]]
-    ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC4]]
-    ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]]
-    ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
-    ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC5]]
-    ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]]
-    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16)
-    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC3]]
+    ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC4]]
+    ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC5]]
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16)
+    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16)
     ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16)
-    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
+    ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16)
+    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32)
     ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
     ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX8: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX8: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     ; GFX9-LABEL: name: uaddsat_v3s16
     ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
@@ -536,20 +496,15 @@ body: |
     ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32)
     ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC4]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[BUILD_VECTOR_TRUNC2]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[UMIN]]
-    ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32)
-    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC5]]
-    ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[BUILD_VECTOR_TRUNC3]]
-    ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[UMIN1]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[DEF2]](<2 x s16>)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]]
+    ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>), [[DEF2]](<2 x s16>)
     ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0
-    ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
-    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>)
+    ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>)
+    ; GFX9: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>)
     %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
     %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0
     %3:_(<3 x s16>) = G_UADDSAT %1, %2
@@ -650,26 +605,17 @@ body: |
     ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
-    ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC4]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]]
-    ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]]
-    ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC5]]
-    ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]]
-    ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]]
-    ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC6]]
-    ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]]
-    ; GFX8: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC3]], [[C1]]
-    ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[XOR3]], [[TRUNC7]]
-    ; GFX8: [[ADD3:%[0-9]+]]:_(s16) = G_ADD [[TRUNC3]], [[UMIN3]]
-    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16)
-    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC4]]
+    ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC5]]
+    ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC6]]
+    ; GFX8: [[UADDSAT3:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC3]], [[TRUNC7]]
+    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16)
+    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16)
     ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
     ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16)
-    ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ADD3]](s16)
+    ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16)
+    ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT3]](s16)
     ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
     ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
     ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
@@ -680,16 +626,9 @@ body: |
     ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
     ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV]], [[BUILD_VECTOR_TRUNC]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[UV2]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[UMIN]]
-    ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32)
-    ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV1]], [[BUILD_VECTOR_TRUNC1]]
-    ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[UV3]]
-    ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV1]], [[UMIN1]]
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[UV]], [[UV2]]
+    ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[UV1]], [[UV3]]
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>)
     ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
@@ -714,19 +653,13 @@ body: |
     ; GFX8-LABEL: name: uaddsat_s32
     ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]]
-    ; GFX8: $vgpr0 = COPY [[ADD]](s32)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]]
+    ; GFX8: $vgpr0 = COPY [[UADDSAT]](s32)
     ; GFX9-LABEL: name: uaddsat_s32
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]]
-    ; GFX9: $vgpr0 = COPY [[ADD]](s32)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]]
+    ; GFX9: $vgpr0 = COPY [[UADDSAT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = G_UADDSAT %0, %1
@@ -758,28 +691,18 @@ body: |
     ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]]
-    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]]
-    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]]
-    ; GFX8: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]]
-    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]]
-    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]]
-    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32)
+    ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]]
+    ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]]
+    ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32)
     ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ; GFX9-LABEL: name: uaddsat_v2s32
     ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
     ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
     ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
     ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]]
-    ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]]
-    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]]
-    ; GFX9: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]]
-    ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]]
-    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]]
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32)
+    ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]]
+    ; GFX9: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]]
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32)
     ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
index 7a1bf1b4467b4..da102794a474c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
@@ -351,21 +351,24 @@ body: |
     ; SI-LABEL: name: test_umax_v3s16
     ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0
-    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
-    ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
-    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>)
+    ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>)
+    ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>)
+    ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0
+    ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>)
+    ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>)
     ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>)
     ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0
-    ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
-    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0
+    ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>)
+    ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>)
     ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>)
     ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
     ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
@@ -396,31 +399,34 @@ body: |
     ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
     ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL1]]
     ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
-    ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
-    ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
-    ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>)
+    ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
+    ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0
+    ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>)
     ; VI-LABEL: name: test_umax_v3s16
     ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0
     ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x
s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC3]] @@ -436,24 +442,27 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_umax_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: 
[[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]] - ; GFX9: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV8]], [[UV10]] + ; GFX9: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV9]], [[UV11]] + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMAX %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir index de480c00783b5..d5cddf28000c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir @@ -351,21 +351,24 @@ body: | ; SI-LABEL: name: test_umin_v3s16 ; SI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; SI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; SI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) 
= G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; SI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; SI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; SI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) @@ -396,31 +399,34 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL1]] ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; SI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; SI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; SI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; SI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; SI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; VI-LABEL: name: test_umin_v3s16 ; VI: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; VI: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; VI: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 - ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 - ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = 
G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; VI: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; VI: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT1]](<3 x s16>), 0 - ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; VI: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV11]](<2 x s16>) ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; VI: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC3]] @@ -436,24 +442,27 @@ body: | ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) - ; VI: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; VI: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; VI: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) + ; VI: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS2]](<4 x s16>), 0 + ; VI: S_NOP 0, implicit [[EXTRACT]](<3 x s16>) ; GFX9-LABEL: name: test_umin_v3s16 ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT 
[[DEF2]], [[EXTRACT1]](<3 x s16>), 0 - ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>) - ; GFX9: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX9: S_NOP 0, implicit [[EXTRACT2]](<3 x s16>) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; GFX9: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 + ; GFX9: [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; GFX9: [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<4 x s16>) + ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV8]], [[UV10]] + ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV9]], [[UV11]] + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>) + ; GFX9: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; GFX9: [[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>), [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; GFX9: S_NOP 0, implicit [[UV12]](<3 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMIN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index f40ed9796923c..11b9da883008e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -60,13 +60,15 @@ body: | bb.0: ; CHECK-LABEL: name: test_unmerge_s16_v3s16 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) - ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INSERT]](<4 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 
; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir index 356bb38456ea6..6b5d9c8cb5c59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -30,9 +30,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s7 @@ -43,9 +42,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -84,9 +82,8 @@ body: | ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s8 @@ -97,9 +94,8 @@ body: | ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -172,16 +168,14 @@ body: | ; GFX8: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB 
[[SHL]], [[UMIN]] - ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C3]](s16) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX8: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX8: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX8: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -208,16 +202,14 @@ body: | ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -263,18 +255,16 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -334,12 +324,10 @@ body: | ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST1]], [[C]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC3]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) @@ -347,9 +335,8 @@ body: | ; GFX9-LABEL: name: usubsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_USUBSAT %0, %1 @@ -416,9 +403,11 @@ body: | ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX6: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX6: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX6: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX6: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX6: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX8-LABEL: name: usubsat_v3s16 ; GFX8: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX8: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) @@ -442,27 +431,26 @@ body: | ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC3]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC4]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC5]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT 
[[TRUNC]], [[TRUNC3]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC4]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC5]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX8: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX8: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX8: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF1]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX8: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX8: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) ; GFX9-LABEL: name: usubsat_v3s16 ; GFX9: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) @@ -492,15 +480,15 @@ body: | ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), [[SUB1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 - ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9: 
[[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF3]](<4 x s16>), [[DEF]](<4 x s16>), [[DEF]](<4 x s16>) + ; GFX9: [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>), [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[UV6]](<3 x s16>) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS2]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 %3:_(<3 x s16>) = G_USUBSAT %1, %2 @@ -596,21 +584,17 @@ body: | ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC4]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC5]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC6]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC3]], [[TRUNC7]] - ; GFX8: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[TRUNC3]], [[UMIN3]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC4]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC5]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC6]] + ; GFX8: [[USUBSAT3:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC3]], [[TRUNC7]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) - ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[SUB3]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) @@ -621,11 +605,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), [[SUB1]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = 
COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -648,15 +630,13 @@ body: | ; GFX8-LABEL: name: usubsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX8: $vgpr0 = COPY [[SUB]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX8: $vgpr0 = COPY [[USUBSAT]](s32) ; GFX9-LABEL: name: usubsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_USUBSAT %0, %1 @@ -685,22 +665,18 @@ body: | ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: usubsat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir index 110ed17400d26..3c3987a61cf4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-xor.mir @@ -361,15 +361,18 @@ body: | ; CHECK-LABEL: name: test_xor_v3s16 ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF1]](<4 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = 
G_CONCAT_VECTORS [[DEF]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>), [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<3 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[DEF2]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>), [[UV6:%[0-9]+]]:_(<3 x s16>), [[UV7:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<12 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV4]](<3 x s16>), 0 ; CHECK: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[INSERT]], [[INSERT1]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[XOR]](<4 x s16>), 0 - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF2]], [[EXTRACT2]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[XOR]](<4 x s16>), [[DEF1]](<4 x s16>), [[DEF1]](<4 x s16>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s16>), [[UV9:%[0-9]+]]:_(<3 x s16>), [[UV10:%[0-9]+]]:_(<3 x s16>), [[UV11:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV8]](<3 x s16>), 0 ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT2]](<4 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF @@ -403,27 +406,32 @@ body: | ; CHECK-LABEL: name: test_xor_v5s16 ; CHECK: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF]](<6 x s16>), 0 ; CHECK: [[DEF1:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[DEF1]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>), [[UV2:%[0-9]+]]:_(<5 x s16>), [[UV3:%[0-9]+]]:_(<5 x s16>), [[UV4:%[0-9]+]]:_(<5 x s16>), [[UV5:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<30 x s16>) ; CHECK: [[DEF2:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT]](<5 x s16>), 0 - ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF2]], [[EXTRACT1]](<5 x s16>), 0 - ; CHECK: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) - ; CHECK: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[DEF2]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV6:%[0-9]+]]:_(<5 x s16>), [[UV7:%[0-9]+]]:_(<5 x s16>), [[UV8:%[0-9]+]]:_(<5 x s16>), [[UV9:%[0-9]+]]:_(<5 x s16>), [[UV10:%[0-9]+]]:_(<5 x s16>), [[UV11:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<30 x s16>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV]](<5 x s16>), 0 + ; CHECK: [[INSERT1:%[0-9]+]]:_(<6 x s16>) = G_INSERT [[DEF1]], [[UV6]](<5 x s16>), 0 + ; CHECK: 
[[UV12:%[0-9]+]]:_(<3 x s16>), [[UV13:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT]](<6 x s16>) + ; CHECK: [[UV14:%[0-9]+]]:_(<3 x s16>), [[UV15:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[INSERT1]](<6 x s16>) ; CHECK: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV]](<3 x s16>), 0 - ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV2]](<3 x s16>), 0 + ; CHECK: [[INSERT2:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV12]](<3 x s16>), 0 + ; CHECK: [[INSERT3:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV14]](<3 x s16>), 0 ; CHECK: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[INSERT2]], [[INSERT3]] - ; CHECK: [[EXTRACT2:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[XOR]](<4 x s16>), 0 - ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV1]](<3 x s16>), 0 - ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV3]](<3 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS2:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[XOR]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV16:%[0-9]+]]:_(<3 x s16>), [[UV17:%[0-9]+]]:_(<3 x s16>), [[UV18:%[0-9]+]]:_(<3 x s16>), [[UV19:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<12 x s16>) + ; CHECK: [[INSERT4:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV13]](<3 x s16>), 0 + ; CHECK: [[INSERT5:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF3]], [[UV15]](<3 x s16>), 0 ; CHECK: [[XOR1:%[0-9]+]]:_(<4 x s16>) = G_XOR [[INSERT4]], [[INSERT5]] - ; CHECK: [[EXTRACT3:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[XOR1]](<4 x s16>), 0 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT2]](<3 x s16>), [[EXTRACT3]](<3 x s16>) - ; CHECK: [[EXTRACT4:%[0-9]+]]:_(<5 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 + ; CHECK: [[CONCAT_VECTORS3:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[XOR1]](<4 x s16>), [[DEF3]](<4 x s16>), [[DEF3]](<4 x s16>) + ; CHECK: [[UV20:%[0-9]+]]:_(<3 x s16>), [[UV21:%[0-9]+]]:_(<3 x s16>), [[UV22:%[0-9]+]]:_(<3 x s16>), [[UV23:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<12 x s16>) + ; CHECK: [[CONCAT_VECTORS4:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV16]](<3 x s16>), [[UV20]](<3 x s16>) + ; CHECK: [[CONCAT_VECTORS5:%[0-9]+]]:_(<30 x s16>) = G_CONCAT_VECTORS [[CONCAT_VECTORS4]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>), [[DEF1]](<6 x s16>) + ; CHECK: [[UV24:%[0-9]+]]:_(<5 x s16>), [[UV25:%[0-9]+]]:_(<5 x s16>), [[UV26:%[0-9]+]]:_(<5 x s16>), [[UV27:%[0-9]+]]:_(<5 x s16>), [[UV28:%[0-9]+]]:_(<5 x s16>), [[UV29:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS5]](<30 x s16>) ; CHECK: [[DEF4:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[EXTRACT4]](<5 x s16>), 0 + ; CHECK: [[INSERT6:%[0-9]+]]:_(<8 x s16>) = G_INSERT [[DEF4]], [[UV24]](<5 x s16>), 0 ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT6]](<8 x s16>) %0:_(<5 x s16>) = G_IMPLICIT_DEF %1:_(<5 x s16>) = G_IMPLICIT_DEF @@ -463,8 +471,9 @@ body: | ; CHECK: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[ANYEXT6]], [[ANYEXT7]] ; CHECK: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[XOR3]](s32) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(<3 x s8>) = G_EXTRACT [[BUILD_VECTOR]](<4 x s8>), 0 - ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[EXTRACT]](<3 x s8>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s8>) = G_CONCAT_VECTORS 
[[BUILD_VECTOR]](<4 x s8>), [[DEF2]](<4 x s8>), [[DEF2]](<4 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(<3 x s8>), [[UV9:%[0-9]+]]:_(<3 x s8>), [[UV10:%[0-9]+]]:_(<3 x s8>), [[UV11:%[0-9]+]]:_(<3 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s8>) + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(<3 x s32>) = G_ANYEXT [[UV8]](<3 x s8>) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[ANYEXT8]](<3 x s32>) %0:_(<3 x s8>) = G_IMPLICIT_DEF %1:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll new file mode 100644 index 0000000000000..9bc56d124a9cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN + +define i32 @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %ptr, i32 %data) + ret i32 %ret +} + +define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: s_movk_i32 s4, 0x1000 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3 +; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) + ret i32 %ret +} + +define void @global_atomic_csub_nortn(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %ptr, i32 %data) + ret void +} + +define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub_offset_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN-NEXT: s_movk_i32 s4, 0x1000 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3 +; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) + ret void +} + +define 
amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub_sgpr_base_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x8 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) + store i32 %ret, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(i32 addrspace(1)* %ptr, i32 %data) { +; GCN-LABEL: global_atomic_csub_sgpr_base_offset_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_u32 s0, s0, 0x1000 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc +; GCN-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 + %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) + ret void +} + +declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #1 + +attributes #0 = { nounwind willreturn } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index e3fd488af5e0d..f44c91ff6710d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -115,10 +115,8 @@ define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_r } ; ALL-LABEL: {{^}}func_kernarg_segment_ptr: -; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}} -; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}} -; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}} -; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}} +; ALL: v_mov_b32_e32 v0, 0{{$}} +; ALL: v_mov_b32_e32 v1, 0{{$}} define i8 addrspace(4)* @func_kernarg_segment_ptr() { %ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ret i8 addrspace(4)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 66425c27a19fe..805aa301f9383 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -172,19 +172,24 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, 
align 4) - ; GFX6: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0_sub1_sub2 - ; GFX6: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX6: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) + ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX6: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX6: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX6: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX6: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX6: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 + ; GFX6: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 + ; GFX6: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 + ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX6: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX7-LABEL: name: s_buffer_load_v3i32 @@ -196,19 +201,24 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) - ; GFX7: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0_sub1_sub2 - ; GFX7: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX7: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) + ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX7: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX7: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX7: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX7: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX7: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX7: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 + ; GFX7: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 + ; GFX7: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 + ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX7: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX8-LABEL: name: s_buffer_load_v3i32 @@ -220,19 +230,24 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) - ; GFX8: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR]].sub0_sub1_sub2 - ; GFX8: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 - ; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 - ; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX8: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8: [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4) + ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11 + ; GFX8: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX8: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX8: [[COPY7:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX8: [[COPY8:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX8: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 + ; GFX8: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 + ; GFX8: [[COPY11:%[0-9]+]]:sreg_32 = COPY 
[[COPY5]].sub2 + ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -973,7 +988,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -987,7 +1002,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX7: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1001,7 +1016,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) { ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1020,7 +1035,7 @@ define amdgpu_ps 
i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1047,7 +1062,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) { ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1066,7 +1081,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1093,7 +1108,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) { ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -8 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1112,7 +1127,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1139,7 +1154,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) { ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1342,7 +1357,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1369,7 +1384,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc) ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1048576 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1433,7 +1448,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX6: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1460,7 +1475,7 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc) ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008 + ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -524288 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec @@ -1580,13 +1595,20 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1_sub2 - ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 - ; GFX6: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub1 - ; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub2 - ; GFX6: $vgpr0 = COPY [[COPY6]] - ; GFX6: $vgpr1 = COPY [[COPY7]] - ; GFX6: $vgpr2 = COPY [[COPY8]] + ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX6: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX6: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 + ; GFX6: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX6: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX6: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX6: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX6: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX6: $vgpr0 = COPY [[COPY11]] + ; GFX6: $vgpr1 = COPY [[COPY12]] + ; GFX6: $vgpr2 = COPY [[COPY13]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX7: bb.1 (%ir-block.0): @@ -1599,13 +1621,20 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[COPY5:%[0-9]+]]:vreg_96 = COPY 
[[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1_sub2 - ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 - ; GFX7: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub1 - ; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub2 - ; GFX7: $vgpr0 = COPY [[COPY6]] - ; GFX7: $vgpr1 = COPY [[COPY7]] - ; GFX7: $vgpr2 = COPY [[COPY8]] + ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX7: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX7: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 + ; GFX7: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX7: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX7: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX7: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX7: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX7: $vgpr0 = COPY [[COPY11]] + ; GFX7: $vgpr1 = COPY [[COPY12]] + ; GFX7: $vgpr2 = COPY [[COPY13]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; GFX8-LABEL: name: s_buffer_load_v3f32_vgpr_offset ; GFX8: bb.1 (%ir-block.0): @@ -1618,13 +1647,20 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1_sub2 - ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 - ; GFX8: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub1 - ; GFX8: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub2 - ; GFX8: $vgpr0 = COPY [[COPY6]] - ; GFX8: $vgpr1 = COPY [[COPY7]] - ; GFX8: $vgpr2 = COPY [[COPY8]] + ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GFX8: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX8: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[COPY5]], %subreg.sub4_sub5_sub6_sub7, [[COPY6]], %subreg.sub8_sub9_sub10_sub11 + ; GFX8: [[COPY7:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub0_sub1_sub2 + ; GFX8: [[COPY8:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub3_sub4_sub5 + ; GFX8: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub6_sub7_sub8 + ; GFX8: [[COPY10:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]].sub9_sub10_sub11 + ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub0 + ; GFX8: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub1 + ; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]].sub2 + ; GFX8: $vgpr0 = COPY [[COPY11]] + ; GFX8: $vgpr1 = COPY [[COPY12]] + ; GFX8: $vgpr2 = COPY [[COPY13]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x float> %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 7d116f8e8925f..e5d26476e9424 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -414,8 +414,8 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) ; GFX6-LABEL: bfe_i32_constant_fold_test_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_bfe_i32 s3, -1, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_bfe_i32 s2, -1, 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -617,8 +617,8 @@ define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) ; GFX6-LABEL: bfe_i32_constant_fold_test_16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_bfe_i32 s3, -1, 0x70001 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_bfe_i32 s2, -1, 0x70001 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index fdcf0f1515f91..172656f08aefb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -82,24 +82,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ;
GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define i32 @v_sdot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false) ret i32 %r diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index a5737e8233af3..ab3fbc03e81d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -694,8 +694,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) ; GFX6-LABEL: bfe_u32_constant_fold_test_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_bfe_u32 s3, -1, 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_bfe_u32 s2, -1, 0x10000 +; GFX6-NEXT:
v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -897,8 +897,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) ; GFX6-LABEL: bfe_u32_constant_fold_test_16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_bfe_u32 s3, -1, 0x70001 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_bfe_u32 s2, -1, 0x70001 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -950,22 +950,22 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s8, 63 -; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index d285ee132cc21..976536c728838 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -82,24 +82,21 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define i32 @v_udot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_udot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false) ret i32 %r diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index af8b4f0f9e5ae..6dceaf2e22fa8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -17,57 +17,56 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) { ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v3 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NOUNALIGNED-NEXT:
v_lshlrev_b32_e32 v5, 24, v6 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v7, v3, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v10 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v11 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v6, v7 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v3 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, v3, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v10, v3, v11 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v8, v2 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: @@ -156,31 +155,30 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* 
%ptr) { ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 -; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, v3, v6 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 +; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, v3, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, v3, v2 +; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: @@ -194,35 +192,36 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) { ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX7-NOUNALIGNED: ; %bb.0: -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 
offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v0 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5 -; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load } @@ -399,97 +398,101 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: 
s_add_u32 s2, s0, 3
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v17, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v16, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[14:15], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[16:17], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[18:19], off
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s1
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s0
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v2
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v3
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v21, v5
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v14, v5
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v11, v5
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v15, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX7-UNALIGNED: ; %bb.0:
@@ -585,52 +588,52 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ;
 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED: ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v10, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v5
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v5
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX7-UNALIGNED: ; %bb.0:
@@ -648,37 +651,35 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
 ;
 ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX7-NOUNALIGNED: ; %bb.0:
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:2
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
-; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog
+; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
+; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10
+; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog
 %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2
 ret <3 x i32> %load
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 06bf7f7949309..23398f8ecf6e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -8,9 +8,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
 ; GFX9-LABEL: localize_constants:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s0, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_xor_b32 s1, s1, 1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
 ; GFX9-NEXT: s_and_b32 s1, s1, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT: s_cbranch_scc0 BB0_2
@@ -83,9 +83,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-LABEL: localize_globals:
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s0, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_xor_b32 s1, s1, 1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
 ; GFX9-NEXT: s_and_b32 s1, s1, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT: s_cbranch_scc0 BB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 9d82396bbc364..ea2631cbcb294 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -533,8 +533,7 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_lshr_v2i16_15:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, s4, v0
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %result = lshr <2 x i16> %value, <i16 15, i16 15>
 ret <2 x i16> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index b8e69433913ea..670c9898c2798 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -56,15 +56,17 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
  ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
- ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
- ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
- ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
+ ; CHECK: [[UV:%[0-9]+]]:sgpr(<3 x s32>), [[UV1:%[0-9]+]]:sgpr(<3 x s32>), [[UV2:%[0-9]+]]:sgpr(<3 x s32>), [[UV3:%[0-9]+]]:sgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+ ; CHECK: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>)
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
  ; CHECK: $sgpr0 = COPY [[INT]](s32)
- ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
- ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+ ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
@@ -232,11 +234,15 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
  ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
  ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
- ; CHECK: [[EXTRACT:%[0-9]+]]:vgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_BUFFER_LOAD]](<4 x s32>), 0
- ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
- ; CHECK: $vgpr0 = COPY [[UV]](s32)
- ; CHECK: $vgpr1 = COPY [[UV1]](s32)
- ; CHECK: $vgpr2 = COPY [[UV2]](s32)
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
+ ; CHECK: [[COPY6:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[DEF]](<4 x s32>)
+ ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[COPY5]](<4 x s32>), [[COPY6]](<4 x s32>)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(<3 x s32>), [[UV1:%[0-9]+]]:vgpr(<3 x s32>), [[UV2:%[0-9]+]]:vgpr(<3 x s32>), [[UV3:%[0-9]+]]:vgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
+ ; CHECK: [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>)
+ ; CHECK: $vgpr0 = COPY [[UV4]](s32)
+ ; CHECK: $vgpr1 = COPY [[UV5]](s32)
+ ; CHECK: $vgpr2 = COPY [[UV6]](s32)
  ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
 ret <3 x float> %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
new file mode 100644
index 0000000000000..09eda4be61975
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -0,0 +1,566 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_roundeven_f32(float %x) {
+; GFX6-LABEL: v_roundeven_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call float @llvm.roundeven.f32(float %x)
+ ret float %roundeven
+}
+
+define <2 x float> @v_roundeven_v2f32(<2 x float> %x) {
+; GFX6-LABEL: v_roundeven_v2f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e32 v0, v0
+; GFX8-NEXT: v_rndne_f32_e32 v1, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e32 v0, v0
+; GFX9-NEXT: v_rndne_f32_e32 v1, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x)
+ ret <2 x float> %roundeven
+}
+
+define <3 x float> @v_roundeven_v3f32(<3 x float> %x) {
+; GFX6-LABEL: v_roundeven_v3f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: v_rndne_f32_e32 v2, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v3f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: v_rndne_f32_e32 v2, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v3f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e32 v0, v0
+; GFX8-NEXT: v_rndne_f32_e32 v1, v1
+; GFX8-NEXT: v_rndne_f32_e32 v2, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e32 v0, v0
+; GFX9-NEXT: v_rndne_f32_e32 v1, v1
+; GFX9-NEXT: v_rndne_f32_e32 v2, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x)
+ ret <3 x float> %roundeven
+}
+
+define <4 x float> @v_roundeven_v4f32(<4 x float> %x) {
+; GFX6-LABEL: v_roundeven_v4f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: v_rndne_f32_e32 v2, v2
+; GFX6-NEXT: v_rndne_f32_e32 v3, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v4f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: v_rndne_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v4f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e32 v0, v0
+; GFX8-NEXT: v_rndne_f32_e32 v1, v1
+; GFX8-NEXT: v_rndne_f32_e32 v2, v2
+; GFX8-NEXT: v_rndne_f32_e32 v3, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e32 v0, v0
+; GFX9-NEXT: v_rndne_f32_e32 v1, v1
+; GFX9-NEXT: v_rndne_f32_e32 v2, v2
+; GFX9-NEXT: v_rndne_f32_e32 v3, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
+ ret <4 x float> %roundeven
+}
+
+define half @v_roundeven_f16(half %x) {
+; GFX6-LABEL: v_roundeven_f16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f16_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call half @llvm.roundeven.f16(half %x)
+ ret half %roundeven
+}
+
+define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
+; GFX6-LABEL: v_roundeven_v2f16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f16_e32 v1, v0
+; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f16_e32 v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
+ ret <2 x half> %roundeven
+}
+
+define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
+; GFX6-LABEL: v_roundeven_v2f16_fneg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT: v_rndne_f32_e32 v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f16_fneg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_rndne_f32_e32 v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f16_fneg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT: v_rndne_f16_e32 v1, v0
+; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f16_fneg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT: v_rndne_f16_e32 v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %x.fneg = fneg <2 x half> %x
+ %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
+ ret <2 x half> %roundeven
+}
+
+define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
+; GFX6-LABEL: v_roundeven_v4f16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; GFX6-NEXT: v_rndne_f32_e32 v1, v1
+; GFX6-NEXT: v_rndne_f32_e32 v2, v2
+; GFX6-NEXT: v_rndne_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v4f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; GFX7-NEXT: v_rndne_f32_e32 v1, v1
+; GFX7-NEXT: v_rndne_f32_e32 v2, v2
+; GFX7-NEXT: v_rndne_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f16_e32 v2, v0
+; GFX8-NEXT: v_rndne_f16_e32 v3, v1
+; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v4, 16
+; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f16_e32 v2, v0
+; GFX9-NEXT: v_rndne_f16_e32 v3, v1
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
+ ret <4 x half> %roundeven
+}
+
+
+define float @v_roundeven_f32_fabs(float %x) {
+; GFX6-LABEL: v_roundeven_f32_fabs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e64 v0, |v0|
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32_fabs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e64 v0, |v0|
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32_fabs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e64 v0, |v0|
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32_fabs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e64 v0, |v0|
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %roundeven = call float @llvm.roundeven.f32(float %fabs.x)
+ ret float %roundeven
+}
+
+define amdgpu_ps float @s_roundeven_f32(float inreg %x) {
+; GFX6-LABEL: s_roundeven_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_rndne_f32_e32 v0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_roundeven_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_rndne_f32_e32 v0, s0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_roundeven_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_rndne_f32_e32 v0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_roundeven_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_rndne_f32_e32 v0, s0
+; GFX9-NEXT: ; return to shader part epilog
+ %roundeven = call float @llvm.roundeven.f32(float %x)
+ ret float %roundeven
+}
+
+define float @v_roundeven_f32_fneg(float %x) {
+; GFX6-LABEL: v_roundeven_f32_fneg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_rndne_f32_e64 v0, -v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32_fneg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f32_e64 v0, -v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32_fneg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f32_e64 v0, -v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32_fneg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f32_e64 v0, -v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %neg.x = fneg float %x
+ %roundeven = call float @llvm.roundeven.f32(float %neg.x)
+ ret float %roundeven
+}
+
+define double @v_roundeven_f64(double %x) {
+; GFX6-LABEL: v_roundeven_f64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0x80000000, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_or_b32_e32 v3, 0x43300000, v3
+; GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call double @llvm.roundeven.f64(double %x)
+ ret double %roundeven
+}
+
+define double @v_roundeven_f64_fneg(double %x) {
+; GFX6-LABEL: v_roundeven_f64_fneg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX6-NEXT: v_and_b32_e32 v4, 0x80000000, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
+; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4
+; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4]
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f64_fneg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f64_fneg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f64_fneg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %neg.x = fneg double %x
+ %roundeven = call double @llvm.roundeven.f64(double %neg.x)
+ ret double %roundeven
+}
+
+define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
+; GFX6-LABEL: v_roundeven_v2f64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_brev_b32 s6, 1
+; GFX6-NEXT: s_mov_b32 s7, 0x43300000
+; GFX6-NEXT: v_and_b32_e32 v5, s6, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, 0
+; GFX6-NEXT: v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX6-NEXT: v_and_b32_e32 v5, s6, v3
+; GFX6-NEXT: v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
+ ret <2 x double> %roundeven
+}
+
+declare half @llvm.roundeven.f16(half) #0
+declare <2 x half> @llvm.roundeven.v2f16(<2 x half>) #0
+declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) #0
+
+declare float @llvm.roundeven.f32(float) #0
+declare <2 x float> @llvm.roundeven.v2f32(<2 x float>) #0
+declare <3 x float> @llvm.roundeven.v3f32(<3 x float>) #0
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0
+
+declare double @llvm.roundeven.f64(double) #0
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) #0
+
+declare half @llvm.fabs.f16(half) #0
+declare float @llvm.fabs.f32(float) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 80ff9bb2b5752..7b88123b2c9b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -39,14 +39,8 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
-; GFX9-NEXT: v_min_i16_e32 v3, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v2, 0, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
-; GFX9-NEXT: v_sub_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_max_i16_e32 v1, v3, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -57,13 +51,7 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i16_e64 v2, v0, 0
-; GFX10-NEXT: v_max_i16_e64 v3, v0, 0
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
@@ -101,7 +89,7 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1
@@ -118,54 +106,23 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i7:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, 0
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s5
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_i7:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cmp_gt_i32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s5
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
 ret i7 %result
@@ -206,14 +163,8 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_min_i16_e32 v3, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v2, 0, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_sub_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_max_i16_e32 v1, v3, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -224,13 +175,7 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i16_e64 v2, v0, 0
-; GFX10-NEXT: v_max_i16_e64 v3, v0, 0
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
@@ -268,7 +213,7 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, 0x8000, s3
+; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
 ; GFX8-NEXT: s_sext_i32_i16 s3, s3
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1
@@ -285,54 +230,23 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, 0
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s1
-; GFX9-NEXT: s_cselect_b32 s1, s3, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s5
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_i8:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cmp_gt_i32 s3, s1
-; GFX10-NEXT: s_cselect_b32 s1, s3, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s5
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
 ret i8 %result
@@ -344,7 +258,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
@@ -379,7 +293,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -408,26 +322,12 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s5, 0x8000
-; GFX9-NEXT: v_min_i16_e32 v5, 0, v0
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_sub_u16_e32 v5, s5, v5
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v4, 0, v0
-; GFX9-NEXT: v_sub_u16_e32 v4, s4, v4
-; GFX9-NEXT: v_max_i16_e32 v1, v5, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v4
-; GFX9-NEXT: v_min_i16_e32 v4, 0, v2
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, 0, v2
-; GFX9-NEXT: v_sub_u16_e32 v4, s5, v4
-; GFX9-NEXT: v_sub_u16_e32 v1, s4, v1
-; GFX9-NEXT: v_max_i16_e32 v3, v4, v3
-; GFX9-NEXT: v_min_i16_e32 v1, v3, v1
+; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_add_i16 v1, v2, v3 clamp
 ; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -438,31 +338,17 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s5, 0x8000
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
-; GFX10-NEXT: v_min_i16_e64 v4, v2, 0
-; GFX10-NEXT: v_min_i16_e64 v5, v0, 0
-; GFX10-NEXT: v_max_i16_e64 v6, v2, 0
-; GFX10-NEXT: v_max_i16_e64 v7, v0, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v4, s5, v4
-; GFX10-NEXT: v_sub_nc_u16_e64 v5, s5, v5
-; GFX10-NEXT: v_sub_nc_u16_e64 v6, s4, v6
-; GFX10-NEXT: v_sub_nc_u16_e64 v7, s4, v7
+; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_max_i16_e64 v1, v4, v1
-; GFX10-NEXT: v_max_i16_e64 v10, v5, v3
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v6
-; GFX10-NEXT: v_min_i16_e64 v3, v10, v7
-; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3
-; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_nc_i16 v1, v2, v1 clamp
+; GFX10-NEXT: v_add_nc_i16 v0, v0, v3 clamp
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
 %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -483,7 +369,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT: s_cselect_b32 s6, s0, 0
 ; GFX6-NEXT: s_sub_i32 s6, s4, s6
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0
 ; GFX6-NEXT: s_sub_i32 s7, s5, s7
 ; GFX6-NEXT: s_cmp_gt_i32 s7, s1
@@ -527,7 +413,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT: s_cselect_b32 s9, s7, s8
 ; GFX8-NEXT: s_sub_i32 s9, s5, s9
 ; GFX8-NEXT: s_cmp_lt_i32 s7, s8
-; GFX8-NEXT: s_mov_b32 s6, 0x8000
+; GFX8-NEXT: s_movk_i32 s6, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s7, s7, s8
 ; GFX8-NEXT: s_sub_i32 s7, s6, s7
 ; GFX8-NEXT: s_sext_i32_i16 s7, s7
@@ -571,112 +457,40 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_saddsat_v2i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_sext_i32_i16 s7, s0
-; GFX9-NEXT: s_sext_i32_i16 s8, 0
-; GFX9-NEXT: s_cmp_gt_i32 s7, s8
-; GFX9-NEXT: s_movk_i32 s5, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s9, s7, s8
-; GFX9-NEXT: s_sub_i32 s9, s5, s9
-; GFX9-NEXT: s_cmp_lt_i32 s7, s8
-; GFX9-NEXT: s_mov_b32 s6, 0x8000
-; GFX9-NEXT: s_cselect_b32 s7, s7, s8
-; GFX9-NEXT: s_sub_i32 s7, s6, s7
-; GFX9-NEXT: s_sext_i32_i16 s7, s7
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s7, s1
-; GFX9-NEXT: s_cselect_b32 s1, s7, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s7, s9
-; GFX9-NEXT: s_cmp_lt_i32 s1, s7
-; GFX9-NEXT: s_cselect_b32 s1, s1, s7
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_lshl_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s2, s3, s4
-; GFX9-NEXT: s_ashr_i32 s0, s0, s4
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s7, s3, s8
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s3, s6, s3
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_cmp_gt_i32 s3, s2
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s5
-; GFX9-NEXT: s_cmp_lt_i32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_add_i32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_movk_i32 s2, 0xff
-; GFX9-NEXT: s_ashr_i32 s1, s1, s4
-; GFX9-NEXT: s_and_b32 s1, s1, s2
-; GFX9-NEXT: s_and_b32 s0, s0, s2
-; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s2, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s4
+; GFX9-NEXT: s_lshl_b32 s1, s3, s4
+; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: s_lshl_b32 s0, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_add_i16 v1, s0, v1 clamp
+; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v2i8:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_sext_i32_i16 s6, 0
-; GFX10-NEXT: s_sext_i32_i16 s5, s0
+; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000
 ; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6
-; GFX10-NEXT: s_movk_i32 s7, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s8, s5, s6
-; GFX10-NEXT: s_mov_b32 s9, 0x8000
-; GFX10-NEXT: s_sub_i32 s8, s7, s8
-; GFX10-NEXT: s_cmp_lt_i32 s5, s6
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_lshl_b32 s1, s1, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s3
+; GFX10-NEXT: s_lshl_b32 s3, s4, s3
+; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_add_nc_i16 v1, s2, s3 clamp
+; GFX10-NEXT: s_movk_i32 s0, 0xff
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_sub_i32 s5, s9, s5
-; GFX10-NEXT: s_sext_i32_i16 s5, s5
-; GFX10-NEXT: s_cmp_gt_i32 s5, s1
-; GFX10-NEXT: s_cselect_b32 s1, s5, s1
-; GFX10-NEXT: s_sext_i32_i16 s5, s8
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s5
-; GFX10-NEXT: s_cselect_b32 s1, s1, s5
-; GFX10-NEXT: s_lshl_b32 s3, s3, s2
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s4, s2
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_sext_i32_i16 s4, s3
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
-; GFX10-NEXT: s_cmp_gt_i32 s4, s6
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cselect_b32 s5, s4, s6
-; GFX10-NEXT: s_sub_i32 s5, s7, s5
-; GFX10-NEXT: s_cmp_lt_i32 s4, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_sub_i32 s4, s9, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s4
-; GFX10-NEXT: s_cmp_gt_i32 s4, s1
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s4, s5
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s4
-; GFX10-NEXT: s_cselect_b32 s1, s1, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s3
-; GFX10-NEXT: s_movk_i32 s3, 0xff
-; GFX10-NEXT: s_ashr_i32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s0, s0, s3
-; GFX10-NEXT: s_and_b32 s1, s1, s3
-; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
 %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -693,7 +507,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0
 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
@@ -726,7 +540,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1
@@ -760,7 +574,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v10, 0, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
@@ -815,52 +629,25 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s5, 0x8000
-; GFX9-NEXT: v_min_i16_e32 v10, 0, v0
-; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_sub_u16_e32 v10, s5, v10
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v8, 0, v0
-; GFX9-NEXT: v_sub_u16_e32 v8, s4, v8
-; GFX9-NEXT: v_max_i16_e32 v1, v10, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v8
-; GFX9-NEXT: v_min_i16_e32 v8, 0, v2
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, 0, v2
-; GFX9-NEXT: v_sub_u16_e32 v8, s5, v8
-; GFX9-NEXT: v_sub_u16_e32 v1, s4, v1
-; GFX9-NEXT: v_max_i16_e32 v5, v8, v5
-; GFX9-NEXT: v_min_i16_e32 v1, v5, v1
-; GFX9-NEXT: v_add_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
+; GFX9-NEXT: v_add_i16 v1, v2, v5 clamp
 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3
 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_min_i16_e32 v6, 0, v2
-; GFX9-NEXT: v_sub_u16_e32 v6, s5, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v5, 0, v2
-; GFX9-NEXT: v_sub_u16_e32 v5, v9, v5
-; GFX9-NEXT: v_max_i16_e32 v3, v6, v3
-; GFX9-NEXT: v_min_i16_e32 v3, v3, v5
-; GFX9-NEXT: v_add_u16_e32 v2, v2, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
-; GFX9-NEXT: v_min_i16_e32 v6, 0, v3
-; GFX9-NEXT: v_max_i16_e32 v5, 0, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
-; GFX9-NEXT: v_sub_u16_e32 v6, 0x8000, v6
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_sub_u16_e32 v5, v9, v5
-; GFX9-NEXT: v_max_i16_e32 v4, v6, v4
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT: v_min_i16_e32 v4, v4, v5
+; GFX9-NEXT: v_add_i16 v2, v2, v3 clamp
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT: v_add_u16_e32 v3, v3, v4
+; GFX9-NEXT: v_add_i16 v3, v3, v4 clamp
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -871,57 +658,30 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: s_mov_b32 s5, 16
+; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0
 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_min_i16_e64 v8, v4, 0
-; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s5, 0x8000
-; GFX10-NEXT: v_min_i16_e64 v9, v2, 0
-; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, s5, v8
-; GFX10-NEXT: v_max_i16_e64 v10, v4, 0
-; GFX10-NEXT: s_mov_b32 s6, 24
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_movk_i32 s4, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v15, s5, v9
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_max_i16_e64 v11, v2, 0
-; GFX10-NEXT: v_max_i16_e64 v7, v8, v7
-; GFX10-NEXT: v_sub_nc_u16_e64 v10, s4, v10
-; GFX10-NEXT: v_max_i16_e64 v5, v15, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, s4, v11
-; GFX10-NEXT: v_min_i16_e64 v11, v3, 0
-; GFX10-NEXT: v_min_i16_e64 v7, v7, v10
-; GFX10-NEXT: v_min_i16_e64 v10, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v9, 0x7fff
-; GFX10-NEXT: v_min_i16_e64 v5, v5, v8
-; GFX10-NEXT: v_sub_nc_u16_e64 v11, s5, v11
-; GFX10-NEXT: v_max_i16_e64 v8, v3, 0
-; GFX10-NEXT: v_sub_nc_u16_e64 v10, 0x8000, v10
-; GFX10-NEXT: v_max_i16_e64 v12, v0, 0
-; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v5
-; GFX10-NEXT: v_max_i16_e64 v6, v11, v6
-; GFX10-NEXT: v_sub_nc_u16_e64 v5, v9, v8
-; GFX10-NEXT: v_max_i16_e64 v1, v10, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v8, v9, v12
-; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, v7
-; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_min_i16_e64 v5, v6, v5
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v8
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1
+; GFX10-NEXT: s_mov_b32 s5, 16
+; GFX10-NEXT: s_mov_b32 s4, 24
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_i16 v2, v2, v3 clamp
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_movk_i32 s5, 0xff
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_add_nc_i16 v5, v5, v6 clamp
+; GFX10-NEXT: v_add_nc_i16 v3, v4, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v4
+; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX10-NEXT: v_add_nc_u16_e64 v3, v3, v5
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
-; GFX10-NEXT: v_and_or_b32 v1, v4, s4, v2
-; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v5
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v3), s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2
+; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i32 %lhs.arg to <4 x i8>
 %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -946,7 +706,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX6-NEXT: s_sub_i32 s10, s8, s10
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s9, 0x80000000
+; GFX6-NEXT: s_brev_b32 s9, 1
 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX6-NEXT: s_sub_i32 s11, s9, s11
 ; GFX6-NEXT: s_cmp_gt_i32 s11, s1
@@ -1028,7 +788,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT: s_cselect_b32 s13, s11, s12
 ; GFX8-NEXT: s_sub_i32 s13, s9, s13
 ; GFX8-NEXT: s_cmp_lt_i32 s11, s12
-; GFX8-NEXT: s_mov_b32 s10, 0x8000
+; GFX8-NEXT: s_movk_i32 s10, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s11, s11, s12
 ; GFX8-NEXT: s_sub_i32 s11, s10, s11
 ; GFX8-NEXT: s_sext_i32_i16 s11, s11
@@ -1118,212 +878,70 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX9-LABEL: s_saddsat_v4i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshr_b32 s3, s0, 16
-; GFX9-NEXT: s_lshr_b32 s4, s0, 24
-; GFX9-NEXT: s_lshl_b32 s0, s0, s8
 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8
 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8
-; GFX9-NEXT: s_sext_i32_i16 s11, s0
-; GFX9-NEXT: s_sext_i32_i16 s12, 0
-; GFX9-NEXT: s_cmp_gt_i32 s11, s12
-; GFX9-NEXT: s_movk_i32 s9, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s13, s11, s12
-; GFX9-NEXT: s_sub_i32 s13, s9, s13
-; GFX9-NEXT: s_cmp_lt_i32 s11, s12
-; GFX9-NEXT: s_mov_b32 s10, 0x8000
-; GFX9-NEXT: s_cselect_b32 s11, s11, s12
-; GFX9-NEXT: s_sub_i32 s11, s10, s11
-; GFX9-NEXT: s_sext_i32_i16 s11, s11
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s11, s1
-; GFX9-NEXT: s_cselect_b32 s1, s11, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s11, s13
-; GFX9-NEXT: s_cmp_lt_i32 s1, s11
-; GFX9-NEXT: s_cselect_b32 s1, s1, s11
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_lshl_b32 s1, s2, s8
-; GFX9-NEXT: s_lshl_b32 s2, s5, s8
-; GFX9-NEXT: s_ashr_i32 s0, s0, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s1
-; GFX9-NEXT: s_cmp_gt_i32 s5, s12
-; GFX9-NEXT: s_cselect_b32 s11, s5, s12
-; GFX9-NEXT: s_sub_i32 s11, s9, s11
-; GFX9-NEXT: s_cmp_lt_i32
s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_gt_i32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s11 -; GFX9-NEXT: s_cmp_lt_i32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_ashr_i32 s1, s1, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s6, s5, s12 -; GFX9-NEXT: s_sub_i32 s6, s9, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s6, s5, s12 -; GFX9-NEXT: s_sub_i32 s6, s9, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: s_cmp_gt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_add_i16 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_add_i16 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_i16 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 
s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_add_nc_i16 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_sext_i32_i16 s10, 0 -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s6 -; GFX10-NEXT: s_cmp_gt_i32 s9, s10 -; GFX10-NEXT: s_movk_i32 s11, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s10 -; GFX10-NEXT: s_mov_b32 s13, 0x8000 -; GFX10-NEXT: s_sub_i32 s12, s11, s12 -; GFX10-NEXT: s_cmp_lt_i32 s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_i16 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_add_nc_i16 v2, s3, s0 clamp +; GFX10-NEXT: v_add_nc_i16 v3, s4, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_sub_i32 s9, s13, s9 -; GFX10-NEXT: s_sext_i32_i16 s9, s9 -; GFX10-NEXT: s_cmp_gt_i32 s9, s1 -; GFX10-NEXT: s_cselect_b32 s1, s9, s1 -; GFX10-NEXT: s_sext_i32_i16 s9, s12 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s9 -; GFX10-NEXT: s_cselect_b32 s1, s1, s9 -; GFX10-NEXT: s_lshl_b32 s5, s5, s6 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s6 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, s6 -; GFX10-NEXT: s_cmp_gt_i32 s2, s10 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s9, s2, s10 -; GFX10-NEXT: s_sub_i32 s9, s11, s9 -; GFX10-NEXT: s_cmp_lt_i32 s2, s10 -; GFX10-NEXT: s_cselect_b32 s2, s2, s10 -; GFX10-NEXT: s_sub_i32 s2, s13, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_gt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s9 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_lshl_b32 s2, s7, s6 -; GFX10-NEXT: s_ashr_i32 s1, s1, s6 -; GFX10-NEXT: s_cmp_gt_i32 s5, s10 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s7, s5, s10 -; GFX10-NEXT: s_sub_i32 s7, s11, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s10 -; GFX10-NEXT: s_cselect_b32 s5, s5, s10 -; GFX10-NEXT: s_sub_i32 s5, s13, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cmp_gt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s7 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s6 -; 
GFX10-NEXT: s_add_i32 s3, s3, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s4 -; GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_lshl_b32 s2, s8, s6 -; GFX10-NEXT: s_ashr_i32 s3, s3, s6 -; GFX10-NEXT: s_cmp_gt_i32 s5, s10 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s7, s5, s10 -; GFX10-NEXT: s_sub_i32 s7, s11, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s10 -; GFX10-NEXT: s_cselect_b32 s5, s5, s10 -; GFX10-NEXT: s_sub_i32 s5, s13, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cmp_gt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s7 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_movk_i32 s7, 0xff -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_and_b32 s1, s1, s7 -; GFX10-NEXT: s_add_i32 s4, s4, s2 -; GFX10-NEXT: s_and_b32 s2, s3, s7 -; GFX10-NEXT: s_sext_i32_i16 s3, s4 -; GFX10-NEXT: s_and_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_ashr_i32 s3, s3, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s7 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -1368,14 +986,8 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, 0x7fffffff, v2 -; GFX9-NEXT: v_max_i32_e32 v1, v3, v1 -; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1386,13 +998,7 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i32_e32 v2, 0, v0 -; GFX10-NEXT: v_max_i32_e32 v3, 0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x7fffffff, v3 -; GFX10-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) @@ -1439,39 +1045,22 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX9-LABEL: s_saddsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, s0, 0 -; GFX9-NEXT: s_sub_i32 s2, 0x7fffffff, s2 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s3, s0, 0 -; GFX9-NEXT: s_sub_i32 s3, 0x80000000, s3 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: 
s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, 0 -; GFX10-NEXT: s_sub_i32 s2, 0x7fffffff, s2 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, s0, 0 -; GFX10-NEXT: s_sub_i32 s3, 0x80000000, s3 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, 8 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -1505,27 +1094,15 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX9-LABEL: v_saddsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, 0x7fffffff, v2 -; GFX9-NEXT: v_max_i32_e32 v1, v3, v1 -; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v2, 0, v0 -; GFX10-NEXT: v_max_i32_e32 v3, 0, v0 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 0x7fffffff, v3 -; GFX10-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1579,33 +1156,16 @@ define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX9-LABEL: s_saddsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, s0, 0 -; GFX9-NEXT: s_sub_i32 s2, 0x7fffffff, s2 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s3, s0, 0 -; GFX9-NEXT: s_sub_i32 s3, 0x80000000, s3 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, 0 -; GFX10-NEXT: s_sub_i32 s2, 0x7fffffff, s2 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s3, s0, 0 -; GFX10-NEXT: s_sub_i32 s3, 0x80000000, s3 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, 
s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1640,29 +1200,13 @@ define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX9-LABEL: saddsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s1, s0, 0 -; GFX9-NEXT: s_sub_i32 s1, 0x7fffffff, s1 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_cselect_b32 s2, s0, 0 -; GFX9-NEXT: s_sub_i32 s2, 0x80000000, s2 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX9-NEXT: v_min_i32_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s1, s0, 0 -; GFX10-NEXT: s_sub_i32 s1, 0x7fffffff, s1 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 -; GFX10-NEXT: s_cselect_b32 s2, s0, 0 -; GFX10-NEXT: s_sub_i32 s2, 0x80000000, s2 -; GFX10-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX10-NEXT: v_min_i32_e32 v0, s1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1694,25 +1238,13 @@ define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX9-LABEL: saddsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_i32_e32 v2, 0, v0 -; GFX9-NEXT: v_max_i32_e32 v1, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, 0x80000000, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_max_i32_e32 v2, s0, v2 -; GFX9-NEXT: v_min_i32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i32 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_i32_e32 v1, 0, v0 -; GFX10-NEXT: v_max_i32_e32 v2, 0, v0 +; GFX10-NEXT: v_add_nc_i32 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x80000000, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x7fffffff, v2 -; GFX10-NEXT: v_max_i32_e32 v1, s0, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1723,7 +1255,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: s_brev_b32 s4, -2 @@ -1744,7 +1276,7 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 ; GFX8-NEXT: s_brev_b32 s4, -2 @@ -1765,45 +1297,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v5, 0, v0 -; 
GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v4, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX9-NEXT: v_min_i32_e32 v2, v2, v4 -; GFX9-NEXT: v_min_i32_e32 v4, 0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_max_i32_e32 v3, v4, v3 -; GFX9-NEXT: v_min_i32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v4, 0, v0 -; GFX10-NEXT: v_min_i32_e32 v5, 0, v1 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX10-NEXT: v_max_i32_e32 v7, 0, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s4, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s4, v5 -; GFX10-NEXT: s_brev_b32 s4, -2 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s4, v6 -; GFX10-NEXT: v_max_i32_e32 v11, v4, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s4, v7 -; GFX10-NEXT: v_max_i32_e32 v10, v5, v3 -; GFX10-NEXT: v_min_i32_e32 v2, v11, v6 -; GFX10-NEXT: v_min_i32_e32 v3, v10, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1817,7 +1321,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX6-NEXT: s_cselect_b32 s6, s0, 0 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0 ; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, s2 @@ -1845,7 +1349,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX8-NEXT: s_cselect_b32 s6, s0, 0 ; GFX8-NEXT: s_sub_i32 s6, s4, s6 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cselect_b32 s7, s0, 0 ; GFX8-NEXT: s_sub_i32 s7, s5, s7 ; GFX8-NEXT: s_cmp_gt_i32 s7, s2 @@ -1868,59 +1372,21 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX9-LABEL: s_saddsat_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: s_cselect_b32 s6, s0, 0 -; GFX9-NEXT: s_sub_i32 s6, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s7, s0, 0 -; GFX9-NEXT: s_sub_i32 s7, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s7, s2 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 -; GFX9-NEXT: s_cmp_lt_i32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s2, s1, 0 -; GFX9-NEXT: s_sub_i32 s2, s4, s2 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s4, s1, 0 -; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: s_cmp_gt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_cmp_lt_i32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: s_cselect_b32 s5, s0, 0 -; GFX10-NEXT: s_mov_b32 s6, 0x80000000 -; GFX10-NEXT: s_sub_i32 s5, s4, s5 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s2 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s0, 0 -; GFX10-NEXT: s_sub_i32 s7, s6, s7 -; GFX10-NEXT: s_cmp_gt_i32 s7, s2 -; GFX10-NEXT: s_cselect_b32 s2, s7, s2 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_gt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s2, s1, 0 -; GFX10-NEXT: s_sub_i32 s2, s4, s2 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s4, s1, 0 -; GFX10-NEXT: s_sub_i32 s4, s6, s4 -; GFX10-NEXT: s_cmp_gt_i32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_cmp_lt_i32 s3, s2 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1930,7 +1396,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 ; GFX6-NEXT: s_brev_b32 s4, -2 @@ -1958,7 +1424,7 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8-NEXT: s_brev_b32 s4, -2 @@ -1986,59 +1452,19 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v7, s5, v7 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v6, s4, v6 -; GFX9-NEXT: v_max_i32_e32 v3, v7, v3 -; GFX9-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX9-NEXT: v_min_i32_e32 v6, 0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_max_i32_e32 v3, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX9-NEXT: v_min_i32_e32 v3, v4, v3 -; GFX9-NEXT: v_min_i32_e32 v4, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_i32_e32 v3, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_max_i32_e32 v4, v4, v5 -; GFX9-NEXT: v_min_i32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_i32 v0, v0, v3 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v4 clamp +; GFX9-NEXT: v_add_i32 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX10-NEXT: v_min_i32_e32 v8, 0, v1 -; GFX10-NEXT: v_min_i32_e32 v9, 0, v2 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v6, 0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, s5, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, s5, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, s5, v9 -; GFX10-NEXT: v_max_i32_e32 v10, 0, v1 -; GFX10-NEXT: v_max_i32_e32 v11, 0, v2 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_max_i32_e32 v3, v14, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s4, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s4, v10 -; GFX10-NEXT: v_max_i32_e32 v4, v15, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s4, v11 -; GFX10-NEXT: v_max_i32_e32 v5, v19, v5 -; GFX10-NEXT: v_min_i32_e32 v3, v3, v6 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v3 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v4 clamp +; GFX10-NEXT: v_add_nc_i32 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i32_e32 v4, v4, v7 -; GFX10-NEXT: v_min_i32_e32 v5, v5, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -2052,7 +1478,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX6-NEXT: s_cselect_b32 s8, s0, 0 ; GFX6-NEXT: s_sub_i32 s8, s6, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s7, 0x80000000 +; GFX6-NEXT: s_brev_b32 s7, 1 ; GFX6-NEXT: s_cselect_b32 s9, s0, 0 ; GFX6-NEXT: s_sub_i32 s9, s7, s9 ; GFX6-NEXT: s_cmp_gt_i32 s9, s3 @@ -2091,7 +1517,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX8-NEXT: s_cselect_b32 s8, s0, 0 ; GFX8-NEXT: s_sub_i32 s8, s6, s8 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s7, 0x80000000 +; GFX8-NEXT: s_brev_b32 s7, 1 ; GFX8-NEXT: s_cselect_b32 s9, s0, 0 ; GFX8-NEXT: s_sub_i32 s9, s7, s9 ; GFX8-NEXT: s_cmp_gt_i32 s9, s3 @@ -2125,81 +1551,26 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX9-LABEL: s_saddsat_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_brev_b32 s6, -2 -; GFX9-NEXT: s_cselect_b32 s8, s0, 0 -; GFX9-NEXT: s_sub_i32 s8, s6, s8 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s7, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s9, s0, 0 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_cmp_gt_i32 s9, s3 -; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_cmp_lt_i32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s3, s8 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s3, s1, 0 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s8, s1, 0 -; GFX9-NEXT: s_sub_i32 s8, s7, s8 -; GFX9-NEXT: s_cmp_gt_i32 s8, s4 -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_cmp_lt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_cmp_gt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s3, s2, 0 -; GFX9-NEXT: s_sub_i32 s3, s6, s3 -; GFX9-NEXT: s_cmp_lt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s4, s2, 0 -; GFX9-NEXT: s_sub_i32 s4, s7, s4 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, 
s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 -; GFX10-NEXT: s_brev_b32 s6, -2 -; GFX10-NEXT: s_cselect_b32 s7, s0, 0 -; GFX10-NEXT: s_mov_b32 s8, 0x80000000 -; GFX10-NEXT: s_sub_i32 s7, s6, s7 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s3 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s4 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s0, 0 -; GFX10-NEXT: s_sub_i32 s9, s8, s9 -; GFX10-NEXT: s_cmp_gt_i32 s9, s3 -; GFX10-NEXT: s_cselect_b32 s3, s9, s3 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_cmp_gt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s3, s1, 0 -; GFX10-NEXT: s_sub_i32 s3, s6, s3 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s7, s1, 0 -; GFX10-NEXT: s_sub_i32 s7, s8, s7 -; GFX10-NEXT: s_cmp_gt_i32 s7, s4 -; GFX10-NEXT: s_cselect_b32 s4, s7, s4 -; GFX10-NEXT: s_cmp_lt_i32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_cmp_gt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s3, s2, 0 -; GFX10-NEXT: s_sub_i32 s3, s6, s3 -; GFX10-NEXT: s_cmp_lt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, s2, 0 -; GFX10-NEXT: s_sub_i32 s4, s8, s4 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -2209,7 +1580,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 ; GFX6-NEXT: s_brev_b32 s4, -2 @@ -2244,7 +1615,7 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 ; GFX8-NEXT: s_brev_b32 s4, -2 @@ -2279,73 +1650,21 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v9, s5, v9 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v8, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v8, s4, v8 -; GFX9-NEXT: v_max_i32_e32 v4, v9, v4 -; GFX9-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX9-NEXT: v_min_i32_e32 v8, 0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_max_i32_e32 v4, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v8, s5, v8 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; 
GFX9-NEXT: v_max_i32_e32 v5, v8, v5 -; GFX9-NEXT: v_min_i32_e32 v4, v5, v4 -; GFX9-NEXT: v_min_i32_e32 v5, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_max_i32_e32 v4, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_min_i32_e32 v4, v5, v4 -; GFX9-NEXT: v_min_i32_e32 v5, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, 0x80000000, v5 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_max_i32_e32 v4, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, 0x7fffffff, v4 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX9-NEXT: v_min_i32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_i32 v0, v0, v4 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v5 clamp +; GFX9-NEXT: v_add_i32 v2, v2, v6 clamp +; GFX9-NEXT: v_add_i32 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v8, 0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 -; GFX10-NEXT: v_min_i32_e32 v11, 0, v1 -; GFX10-NEXT: v_min_i32_e32 v12, 0, v3 -; GFX10-NEXT: v_max_i32_e32 v9, 0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, s4, v8 -; GFX10-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, s4, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, 0x80000000, v12 -; GFX10-NEXT: v_max_i32_e32 v10, 0, v1 -; GFX10-NEXT: v_max_i32_e32 v13, 0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s4, v8 -; GFX10-NEXT: v_max_i32_e32 v14, 0, v3 -; GFX10-NEXT: s_brev_b32 s5, -2 -; GFX10-NEXT: v_max_i32_e32 v5, v11, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, s5, v10 -; GFX10-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, s5, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s5, v9 -; GFX10-NEXT: v_max_i32_e32 v4, v15, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, 0x7fffffff, v14 -; GFX10-NEXT: v_max_i32_e32 v7, v12, v7 -; GFX10-NEXT: v_min_i32_e32 v11, v6, v11 -; GFX10-NEXT: v_min_i32_e32 v19, v5, v10 -; GFX10-NEXT: v_min_i32_e32 v15, v4, v9 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v4 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v5 clamp +; GFX10-NEXT: v_add_nc_i32 v2, v2, v6 clamp +; GFX10-NEXT: v_add_nc_i32 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i32_e32 v6, v7, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v11 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -2359,7 +1678,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, 0 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0 ; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_cmp_gt_i32 s11, s4 @@ -2409,7 +1728,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX8-NEXT: s_cselect_b32 s10, s0, 0 ; GFX8-NEXT: s_sub_i32 s10, s8, s10 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s9, 0x80000000 +; GFX8-NEXT: s_brev_b32 s9, 1 ; GFX8-NEXT: s_cselect_b32 s11, s0, 0 ; GFX8-NEXT: s_sub_i32 s11, s9, s11 ; GFX8-NEXT: s_cmp_gt_i32 s11, s4 @@ -2454,103 +1773,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX9-LABEL: s_saddsat_v4i32: ; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_brev_b32 s8, -2 -; GFX9-NEXT: s_cselect_b32 s10, s0, 0 -; GFX9-NEXT: s_sub_i32 s10, s8, s10 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s9, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s11, s0, 0 -; GFX9-NEXT: s_sub_i32 s11, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 s11, s4 -; GFX9-NEXT: s_cselect_b32 s4, s11, s4 -; GFX9-NEXT: s_cmp_lt_i32 s4, s10 -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s4, s1, 0 -; GFX9-NEXT: s_sub_i32 s4, s8, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s10, s1, 0 -; GFX9-NEXT: s_sub_i32 s10, s9, s10 -; GFX9-NEXT: s_cmp_gt_i32 s10, s5 -; GFX9-NEXT: s_cselect_b32 s5, s10, s5 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_cmp_gt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s4, s2, 0 -; GFX9-NEXT: s_sub_i32 s4, s8, s4 -; GFX9-NEXT: s_cmp_lt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s5, s2, 0 -; GFX9-NEXT: s_sub_i32 s5, s9, s5 -; GFX9-NEXT: s_cmp_gt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_cmp_gt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s4, s3, 0 -; GFX9-NEXT: s_sub_i32 s4, s8, s4 -; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s5, s3, 0 -; GFX9-NEXT: s_sub_i32 s5, s9, s5 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 -; GFX10-NEXT: s_brev_b32 s8, -2 -; GFX10-NEXT: s_cselect_b32 s9, s0, 0 -; GFX10-NEXT: s_mov_b32 s10, 0x80000000 -; GFX10-NEXT: s_sub_i32 s9, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s4 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s5 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s6 clamp +; GFX10-NEXT: v_add_nc_i32 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s11, s0, 0 -; GFX10-NEXT: s_sub_i32 s11, s10, s11 -; GFX10-NEXT: s_cmp_gt_i32 s11, s4 -; GFX10-NEXT: s_cselect_b32 s4, s11, s4 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_cmp_gt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s4, s1, 0 -; GFX10-NEXT: s_sub_i32 s4, s8, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s9, s1, 0 -; GFX10-NEXT: s_sub_i32 s9, s10, s9 -; GFX10-NEXT: s_cmp_gt_i32 s9, s5 -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_cmp_lt_i32 s5, s4 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_cmp_gt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s4, s2, 0 -; GFX10-NEXT: s_sub_i32 s4, s8, s4 -; GFX10-NEXT: s_cmp_lt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s5, s2, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_lt_i32 s5, s4 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s4, s3, 0 -; GFX10-NEXT: s_sub_i32 s4, s8, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s5, s3, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s4 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_add_i32 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -2560,7 +1807,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: s_brev_b32 s4, -2 @@ -2582,7 +1829,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 @@ -2604,7 +1851,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 ; GFX8-NEXT: s_brev_b32 s4, -2 @@ -2626,7 +1873,7 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 @@ -2648,90 +1895,22 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v12, s5, v12 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v10, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v10, s4, v10 -; GFX9-NEXT: v_max_i32_e32 v5, v12, v5 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v10 -; GFX9-NEXT: v_min_i32_e32 v10, 0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_max_i32_e32 v5, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, s5, v10 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_max_i32_e32 v6, v10, v6 -; GFX9-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX9-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX9-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX9-NEXT: 
v_mov_b32_e32 v13, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v6, v13, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_bfrev_b32_e32 v11, -2 -; GFX9-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, v11, v5 -; GFX9-NEXT: v_max_i32_e32 v6, v6, v8 -; GFX9-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX9-NEXT: v_min_i32_e32 v6, 0, v4 -; GFX9-NEXT: v_sub_u32_e32 v6, v13, v6 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, v11, v5 -; GFX9-NEXT: v_max_i32_e32 v6, v6, v9 -; GFX9-NEXT: v_min_i32_e32 v5, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_i32 v0, v0, v5 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v6 clamp +; GFX9-NEXT: v_add_i32 v2, v2, v7 clamp +; GFX9-NEXT: v_add_i32 v3, v3, v8 clamp +; GFX9-NEXT: v_add_i32 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v13, 0, v1 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_min_i32_e32 v10, 0, v0 -; GFX10-NEXT: v_min_i32_e32 v16, 0, v2 -; GFX10-NEXT: v_mov_b32_e32 v15, 0x80000000 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, s5, v13 -; GFX10-NEXT: v_min_i32_e32 v17, 0, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, s5, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, s5, v16 -; GFX10-NEXT: v_max_i32_e32 v11, 0, v0 -; GFX10-NEXT: v_max_i32_e32 v23, v13, v6 -; GFX10-NEXT: v_min_i32_e32 v13, 0, v3 -; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX10-NEXT: v_bfrev_b32_e32 v12, -2 -; GFX10-NEXT: v_max_i32_e32 v14, 0, v1 -; GFX10-NEXT: v_max_i32_e32 v10, 0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v15, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, v15, v17 -; GFX10-NEXT: v_max_i32_e32 v18, 0, v3 -; GFX10-NEXT: v_max_i32_e32 v19, 0, v4 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_max_i32_e32 v7, v16, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, s4, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v12, v18 -; GFX10-NEXT: v_max_i32_e32 v27, v13, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v19 -; GFX10-NEXT: v_max_i32_e32 v9, v15, v9 -; GFX10-NEXT: v_min_i32_e32 v5, v5, v11 -; GFX10-NEXT: v_min_i32_e32 v6, v23, v14 -; GFX10-NEXT: v_min_i32_e32 v7, v7, v10 -; GFX10-NEXT: v_min_i32_e32 v8, v27, v16 -; GFX10-NEXT: v_min_i32_e32 v9, v9, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v5 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v6 clamp +; GFX10-NEXT: v_add_nc_i32 v2, v2, v7 clamp +; GFX10-NEXT: v_add_nc_i32 v3, v3, v8 clamp +; GFX10-NEXT: v_add_nc_i32 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -2746,7 +1925,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX6-NEXT: s_cselect_b32 s12, s0, 0 ; GFX6-NEXT: s_sub_i32 s12, s10, s12 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s11, 0x80000000 +; GFX6-NEXT: s_brev_b32 s11, 1 ; GFX6-NEXT: s_cselect_b32 s13, s0, 0 ; GFX6-NEXT: s_sub_i32 s13, s11, s13 ; GFX6-NEXT: s_cmp_gt_i32 s13, s5 @@ -2807,7 +1986,7 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; 
GFX8-NEXT: s_cselect_b32 s12, s0, 0 ; GFX8-NEXT: s_sub_i32 s12, s10, s12 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s11, 0x80000000 +; GFX8-NEXT: s_brev_b32 s11, 1 ; GFX8-NEXT: s_cselect_b32 s13, s0, 0 ; GFX8-NEXT: s_sub_i32 s13, s11, s13 ; GFX8-NEXT: s_cmp_gt_i32 s13, s5 @@ -2863,125 +2042,36 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX9-LABEL: s_saddsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_brev_b32 s10, -2 -; GFX9-NEXT: s_cselect_b32 s12, s0, 0 -; GFX9-NEXT: s_sub_i32 s12, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s11, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s13, s0, 0 -; GFX9-NEXT: s_sub_i32 s13, s11, s13 -; GFX9-NEXT: s_cmp_gt_i32 s13, s5 -; GFX9-NEXT: s_cselect_b32 s5, s13, s5 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_add_i32 s0, s0, s5 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s5, s1, 0 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s12, s1, 0 -; GFX9-NEXT: s_sub_i32 s12, s11, s12 -; GFX9-NEXT: s_cmp_gt_i32 s12, s6 -; GFX9-NEXT: s_cselect_b32 s6, s12, s6 -; GFX9-NEXT: s_cmp_lt_i32 s6, s5 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s1, s1, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s5, s2, 0 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_cmp_lt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s6, s2, 0 -; GFX9-NEXT: s_sub_i32 s6, s11, s6 -; GFX9-NEXT: s_cmp_gt_i32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s6, s6, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s5 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s2, s2, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s5, s3, 0 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s6, s3, 0 -; GFX9-NEXT: s_sub_i32 s6, s11, s6 -; GFX9-NEXT: s_cmp_gt_i32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s6, s6, s8 -; GFX9-NEXT: s_cmp_lt_i32 s6, s5 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s3, s3, s5 -; GFX9-NEXT: s_cmp_gt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s5, s4, 0 -; GFX9-NEXT: s_sub_i32 s5, s10, s5 -; GFX9-NEXT: s_cmp_lt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s6, s4, 0 -; GFX9-NEXT: s_sub_i32 s6, s11, s6 -; GFX9-NEXT: s_cmp_gt_i32 s6, s9 -; GFX9-NEXT: s_cselect_b32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_i32 s6, s5 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 -; GFX10-NEXT: s_brev_b32 s10, -2 -; GFX10-NEXT: s_cselect_b32 s11, s0, 0 -; GFX10-NEXT: s_mov_b32 s12, 0x80000000 -; GFX10-NEXT: s_sub_i32 s11, s10, s11 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s5 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s6 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s7 clamp +; 
GFX10-NEXT: v_add_nc_i32 v3, s3, s8 clamp +; GFX10-NEXT: v_add_nc_i32 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s13, s0, 0 -; GFX10-NEXT: s_sub_i32 s13, s12, s13 -; GFX10-NEXT: s_cmp_gt_i32 s13, s5 -; GFX10-NEXT: s_cselect_b32 s5, s13, s5 -; GFX10-NEXT: s_cmp_lt_i32 s5, s11 -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_add_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_gt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s5, s1, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s11, s1, 0 -; GFX10-NEXT: s_sub_i32 s11, s12, s11 -; GFX10-NEXT: s_cmp_gt_i32 s11, s6 -; GFX10-NEXT: s_cselect_b32 s6, s11, s6 -; GFX10-NEXT: s_cmp_lt_i32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_gt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s5, s2, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 -; GFX10-NEXT: s_cmp_lt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s6, s2, 0 -; GFX10-NEXT: s_sub_i32 s6, s12, s6 -; GFX10-NEXT: s_cmp_gt_i32 s6, s7 -; GFX10-NEXT: s_cselect_b32 s6, s6, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_gt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s5, s3, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 -; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s6, s3, 0 -; GFX10-NEXT: s_sub_i32 s6, s12, s6 -; GFX10-NEXT: s_cmp_gt_i32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 -; GFX10-NEXT: s_cmp_lt_i32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s5, s4, 0 -; GFX10-NEXT: s_sub_i32 s5, s10, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s6, s4, 0 -; GFX10-NEXT: s_sub_i32 s6, s12, s6 -; GFX10-NEXT: s_cmp_gt_i32 s6, s9 -; GFX10-NEXT: s_cselect_b32 s6, s6, s9 -; GFX10-NEXT: s_cmp_lt_i32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_add_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -2991,7 +2081,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0x80000000 +; GFX6-NEXT: s_brev_b32 s4, 1 ; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s4, v32 ; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 @@ -3014,7 +2104,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16 -; GFX6-NEXT: v_mov_b32_e32 v16, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 @@ -3112,7 +2202,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-LABEL: v_saddsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80000000 +; GFX8-NEXT: s_brev_b32 s4, 1 ; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 ; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s4, v32 ; 
GFX8-NEXT: v_max_i32_e32 v16, v32, v16 @@ -3135,7 +2225,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_mov_b32_e32 v16, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 @@ -3233,244 +2323,44 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-LABEL: v_saddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v32, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v32, s4, v32 -; GFX9-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX9-NEXT: s_brev_b32 s5, -2 -; GFX9-NEXT: v_max_i32_e32 v32, 0, v0 -; GFX9-NEXT: v_sub_u32_e32 v32, s5, v32 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v32 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_min_i32_e32 v16, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v17, 0, v1 -; GFX9-NEXT: v_sub_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_min_i32_e32 v16, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_max_i32_e32 v17, 0, v2 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX9-NEXT: v_sub_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v18, -2 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v3 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v4 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v4 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v5 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v5 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v6 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v6 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v7 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v7 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v8 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v8 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v9 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v9 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v25 -; 
GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v10 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v10 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v10, v10, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v11 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v11 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v12 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v12 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v13 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v13 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v17 -; GFX9-NEXT: v_min_i32_e32 v17, 0, v14 -; GFX9-NEXT: v_sub_u32_e32 v17, v16, v17 -; GFX9-NEXT: v_max_i32_e32 v19, 0, v14 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX9-NEXT: v_sub_u32_e32 v19, v18, v19 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v17 -; GFX9-NEXT: v_max_i32_e32 v17, 0, v15 -; GFX9-NEXT: v_sub_u32_e32 v17, v18, v17 -; GFX9-NEXT: v_min_i32_e32 v18, 0, v15 -; GFX9-NEXT: v_sub_u32_e32 v16, v16, v18 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v31 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_add_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp +; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp +; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp +; GFX9-NEXT: v_add_i32 v4, v4, v20 clamp +; GFX9-NEXT: v_add_i32 v5, v5, v21 clamp +; GFX9-NEXT: v_add_i32 v6, v6, v22 clamp +; GFX9-NEXT: v_add_i32 v7, v7, v23 clamp +; GFX9-NEXT: v_add_i32 v8, v8, v24 clamp +; GFX9-NEXT: v_add_i32 v9, v9, v25 clamp +; GFX9-NEXT: v_add_i32 v10, v10, v26 clamp +; GFX9-NEXT: v_add_i32 v11, v11, v27 clamp +; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp +; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp +; GFX9-NEXT: v_add_i32 v14, v14, v30 clamp +; GFX9-NEXT: v_add_i32 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v32, 0, v0 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v33, 0, v0 -; GFX10-NEXT: s_brev_b32 s5, -2 -; GFX10-NEXT: v_min_i32_e32 v36, 0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v35, s4, v32 -; GFX10-NEXT: v_min_i32_e32 v32, 0, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v33, s5, v33 -; GFX10-NEXT: v_max_i32_e32 v37, 0, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v36, s4, v36 -; GFX10-NEXT: v_max_i32_e32 v16, v35, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v32, s4, v32 -; GFX10-NEXT: v_mov_b32_e32 v35, 0x80000000 -; GFX10-NEXT: v_min_i32_e32 v38, 0, v3 -; GFX10-NEXT: v_max_i32_e32 v18, v36, v18 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v33 -; GFX10-NEXT: v_max_i32_e32 v33, 0, v2 -; GFX10-NEXT: v_max_i32_e32 v39, v32, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v36, v35, v38 -; GFX10-NEXT: v_sub_nc_u32_e32 
v37, s5, v37 -; GFX10-NEXT: v_bfrev_b32_e32 v34, -2 -; GFX10-NEXT: v_sub_nc_u32_e32 v32, s5, v33 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v16 -; GFX10-NEXT: v_max_i32_e32 v33, 0, v3 -; GFX10-NEXT: v_min_i32_e32 v39, v39, v37 -; GFX10-NEXT: v_max_i32_e32 v19, v36, v19 -; GFX10-NEXT: v_min_i32_e32 v16, v18, v32 -; GFX10-NEXT: v_min_i32_e32 v32, 0, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v33 -; GFX10-NEXT: v_min_i32_e32 v38, 0, v5 -; GFX10-NEXT: v_max_i32_e32 v17, 0, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v16 -; GFX10-NEXT: v_min_i32_e32 v16, 0, v4 -; GFX10-NEXT: v_min_i32_e32 v18, v19, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v35, v38 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v39 -; GFX10-NEXT: v_sub_nc_u32_e32 v32, v35, v32 -; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v16 -; GFX10-NEXT: v_max_i32_e32 v33, 0, v5 -; GFX10-NEXT: v_max_i32_e32 v36, 0, v6 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_max_i32_e32 v16, v39, v20 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v34, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v34, v33 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v34, v36 -; GFX10-NEXT: v_max_i32_e32 v22, v32, v22 -; GFX10-NEXT: v_min_i32_e32 v18, 0, v7 -; GFX10-NEXT: v_min_i32_e32 v39, v16, v17 -; GFX10-NEXT: v_min_i32_e32 v38, v19, v20 -; GFX10-NEXT: v_max_i32_e32 v16, 0, v7 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v35, v18 -; GFX10-NEXT: v_min_i32_e32 v18, 0, v8 -; GFX10-NEXT: v_min_i32_e32 v20, 0, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v34, v16 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v19 -; GFX10-NEXT: v_max_i32_e32 v19, 0, v8 -; GFX10-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v35, v18 -; GFX10-NEXT: v_min_i32_e32 v22, 0, v10 -; GFX10-NEXT: v_max_i32_e32 v21, 0, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v35, v20 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v39 -; GFX10-NEXT: v_max_i32_e32 v18, v18, v24 -; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v22 -; GFX10-NEXT: v_min_i32_e32 v16, v17, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v34, v19 -; GFX10-NEXT: v_max_i32_e32 v23, 0, v10 -; GFX10-NEXT: v_max_i32_e32 v20, v20, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v34, v21 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v16 -; GFX10-NEXT: v_min_i32_e32 v17, v18, v19 -; GFX10-NEXT: v_min_i32_e32 v16, 0, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v23 -; GFX10-NEXT: v_max_i32_e32 v19, v39, v26 -; GFX10-NEXT: v_min_i32_e32 v22, 0, v12 -; GFX10-NEXT: v_min_i32_e32 v20, v20, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v35, v16 -; GFX10-NEXT: v_min_i32_e32 v26, 0, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v17 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v35, v22 -; GFX10-NEXT: v_min_i32_e32 v22, 0, v14 -; GFX10-NEXT: v_min_i32_e32 v21, 0, v13 -; GFX10-NEXT: v_max_i32_e32 v24, 0, v14 -; GFX10-NEXT: v_max_i32_e32 v25, 0, v15 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v20 -; GFX10-NEXT: v_max_i32_e32 v20, 0, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v39, v35, v22 -; GFX10-NEXT: v_max_i32_e32 v23, 0, v11 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v17 -; GFX10-NEXT: v_max_i32_e32 v17, 0, v12 -; GFX10-NEXT: v_max_i32_e32 v16, v16, v27 -; GFX10-NEXT: v_sub_nc_u32_e32 v27, v35, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v26, v35, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v34, v23 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v34, v17 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v34, v20 -; GFX10-NEXT: v_max_i32_e32 v21, v27, v29 -; GFX10-NEXT: v_sub_nc_u32_e32 v24, v34, 
v24 -; GFX10-NEXT: v_max_i32_e32 v22, v39, v30 -; GFX10-NEXT: v_sub_nc_u32_e32 v25, v34, v25 -; GFX10-NEXT: v_max_i32_e32 v23, v26, v31 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v18 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX10-NEXT: v_min_i32_e32 v18, v21, v20 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v24 -; GFX10-NEXT: v_min_i32_e32 v20, v23, v25 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v38 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_add_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_add_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v15, v15, v20 +; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp +; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp +; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp +; GFX10-NEXT: v_add_nc_i32 v4, v4, v20 clamp +; GFX10-NEXT: v_add_nc_i32 v5, v5, v21 clamp +; GFX10-NEXT: v_add_nc_i32 v6, v6, v22 clamp +; GFX10-NEXT: v_add_nc_i32 v7, v7, v23 clamp +; GFX10-NEXT: v_add_nc_i32 v8, v8, v24 clamp +; GFX10-NEXT: v_add_nc_i32 v9, v9, v25 clamp +; GFX10-NEXT: v_add_nc_i32 v10, v10, v26 clamp +; GFX10-NEXT: v_add_nc_i32 v11, v11, v27 clamp +; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp +; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp +; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) @@ -3485,7 +2375,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-NEXT: s_cselect_b32 s34, s0, 0 ; GFX6-NEXT: s_sub_i32 s34, s32, s34 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s33, 0x80000000 +; GFX6-NEXT: s_brev_b32 s33, 1 ; GFX6-NEXT: s_cselect_b32 s35, s0, 0 ; GFX6-NEXT: s_sub_i32 s35, s33, s35 ; GFX6-NEXT: s_cmp_gt_i32 s35, s16 @@ -3667,7 +2557,7 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-NEXT: s_cselect_b32 s34, s0, 0 ; GFX8-NEXT: s_sub_i32 s34, s32, s34 ; GFX8-NEXT: s_cmp_lt_i32 s0, 0 -; GFX8-NEXT: s_mov_b32 s33, 0x80000000 +; GFX8-NEXT: s_brev_b32 s33, 1 ; GFX8-NEXT: s_cselect_b32 s35, s0, 0 ; GFX8-NEXT: s_sub_i32 s35, s33, s35 ; GFX8-NEXT: s_cmp_gt_i32 s35, s16 @@ -3844,367 +2734,91 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX9-LABEL: s_saddsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, 0 -; GFX9-NEXT: s_brev_b32 s32, -2 -; GFX9-NEXT: s_cselect_b32 s34, s0, 0 -; GFX9-NEXT: s_sub_i32 s34, s32, s34 -; GFX9-NEXT: s_cmp_lt_i32 s0, 0 -; GFX9-NEXT: s_mov_b32 s33, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s35, s0, 0 -; GFX9-NEXT: s_sub_i32 s35, s33, s35 -; GFX9-NEXT: s_cmp_gt_i32 s35, s16 -; GFX9-NEXT: s_cselect_b32 s16, s35, s16 -; GFX9-NEXT: s_cmp_lt_i32 s16, s34 -; GFX9-NEXT: s_cselect_b32 s16, s16, s34 -; GFX9-NEXT: s_add_i32 s0, s0, s16 -; GFX9-NEXT: s_cmp_gt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s16, s1, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: s_cselect_b32 s34, s1, 0 -; GFX9-NEXT: s_sub_i32 s34, s33, s34 -; GFX9-NEXT: s_cmp_gt_i32 s34, s17 -; GFX9-NEXT: s_cselect_b32 s17, s34, s17 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s1, s1, s16 -; GFX9-NEXT: s_cmp_gt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s16, s2, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s2, 0 -; GFX9-NEXT: s_cselect_b32 s17, s2, 0 -; 
GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s18 -; GFX9-NEXT: s_cselect_b32 s17, s17, s18 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s2, s2, s16 -; GFX9-NEXT: s_cmp_gt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s16, s3, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b32 s17, s3, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s19 -; GFX9-NEXT: s_cselect_b32 s17, s17, s19 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s3, s3, s16 -; GFX9-NEXT: s_cmp_gt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s16, s4, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s17, s4, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s20 -; GFX9-NEXT: s_cselect_b32 s17, s17, s20 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s4, s4, s16 -; GFX9-NEXT: s_cmp_gt_i32 s5, 0 -; GFX9-NEXT: s_cselect_b32 s16, s5, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s5, 0 -; GFX9-NEXT: s_cselect_b32 s17, s5, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s21 -; GFX9-NEXT: s_cselect_b32 s17, s17, s21 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s5, s5, s16 -; GFX9-NEXT: s_cmp_gt_i32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s16, s6, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s17, s6, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s22 -; GFX9-NEXT: s_cselect_b32 s17, s17, s22 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s6, s6, s16 -; GFX9-NEXT: s_cmp_gt_i32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s16, s7, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s17, s7, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s23 -; GFX9-NEXT: s_cselect_b32 s17, s17, s23 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s7, s7, s16 -; GFX9-NEXT: s_cmp_gt_i32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s16, s8, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s17, s8, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s24 -; GFX9-NEXT: s_cselect_b32 s17, s17, s24 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s8, s8, s16 -; GFX9-NEXT: s_cmp_gt_i32 s9, 0 -; GFX9-NEXT: s_cselect_b32 s16, s9, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s9, 0 -; GFX9-NEXT: s_cselect_b32 s17, s9, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s25 -; GFX9-NEXT: s_cselect_b32 s17, s17, s25 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s9, s9, s16 -; GFX9-NEXT: s_cmp_gt_i32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s16, s10, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s17, s10, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s26 -; GFX9-NEXT: s_cselect_b32 s17, s17, s26 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s10, s10, s16 -; GFX9-NEXT: 
s_cmp_gt_i32 s11, 0 -; GFX9-NEXT: s_cselect_b32 s16, s11, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s11, 0 -; GFX9-NEXT: s_cselect_b32 s17, s11, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s27 -; GFX9-NEXT: s_cselect_b32 s17, s17, s27 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s11, s11, s16 -; GFX9-NEXT: s_cmp_gt_i32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s16, s12, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s17, s12, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s28 -; GFX9-NEXT: s_cselect_b32 s17, s17, s28 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_gt_i32 s13, 0 -; GFX9-NEXT: s_cselect_b32 s16, s13, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s13, 0 -; GFX9-NEXT: s_cselect_b32 s17, s13, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s29 -; GFX9-NEXT: s_cselect_b32 s17, s17, s29 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s13, s13, s16 -; GFX9-NEXT: s_cmp_gt_i32 s14, 0 -; GFX9-NEXT: s_cselect_b32 s16, s14, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s14, 0 -; GFX9-NEXT: s_cselect_b32 s17, s14, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s30 -; GFX9-NEXT: s_cselect_b32 s17, s17, s30 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_gt_i32 s15, 0 -; GFX9-NEXT: s_cselect_b32 s16, s15, 0 -; GFX9-NEXT: s_sub_i32 s16, s32, s16 -; GFX9-NEXT: s_cmp_lt_i32 s15, 0 -; GFX9-NEXT: s_cselect_b32 s17, s15, 0 -; GFX9-NEXT: s_sub_i32 s17, s33, s17 -; GFX9-NEXT: s_cmp_gt_i32 s17, s31 -; GFX9-NEXT: s_cselect_b32 s17, s17, s31 -; GFX9-NEXT: s_cmp_lt_i32 s17, s16 -; GFX9-NEXT: s_cselect_b32 s16, s17, s16 -; GFX9-NEXT: s_add_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_add_i32 v5, s5, v5 clamp +; GFX9-NEXT: v_add_i32 v6, s6, v6 clamp +; GFX9-NEXT: v_add_i32 v7, s7, v7 clamp +; GFX9-NEXT: v_add_i32 v8, s8, v8 clamp +; GFX9-NEXT: v_add_i32 v9, s9, v9 clamp +; GFX9-NEXT: v_add_i32 v10, s10, v10 clamp +; GFX9-NEXT: v_add_i32 v11, s11, v11 clamp +; GFX9-NEXT: v_add_i32 v12, s12, v12 clamp +; GFX9-NEXT: v_add_i32 v13, s13, v13 clamp +; GFX9-NEXT: v_add_i32 v14, s14, v14 clamp +; GFX9-NEXT: v_add_i32 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: 
v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, 0 -; GFX10-NEXT: s_brev_b32 s32, -2 -; GFX10-NEXT: s_cselect_b32 s33, s0, 0 -; GFX10-NEXT: s_mov_b32 s34, 0x80000000 -; GFX10-NEXT: s_sub_i32 s46, s32, s33 -; GFX10-NEXT: s_cmp_lt_i32 s0, 0 +; GFX10-NEXT: v_add_nc_i32 v0, s0, s16 clamp +; GFX10-NEXT: v_add_nc_i32 v1, s1, s17 clamp +; GFX10-NEXT: v_add_nc_i32 v2, s2, s18 clamp +; GFX10-NEXT: v_add_nc_i32 v3, s3, s19 clamp +; GFX10-NEXT: v_add_nc_i32 v4, s4, s20 clamp +; GFX10-NEXT: v_add_nc_i32 v5, s5, s21 clamp +; GFX10-NEXT: v_add_nc_i32 v6, s6, s22 clamp +; GFX10-NEXT: v_add_nc_i32 v7, s7, s23 clamp +; GFX10-NEXT: v_add_nc_i32 v8, s8, s24 clamp +; GFX10-NEXT: v_add_nc_i32 v9, s9, s25 clamp +; GFX10-NEXT: v_add_nc_i32 v10, s10, s26 clamp +; GFX10-NEXT: v_add_nc_i32 v11, s11, s27 clamp +; GFX10-NEXT: v_add_nc_i32 v12, s12, s28 clamp +; GFX10-NEXT: v_add_nc_i32 v13, s13, s29 clamp +; GFX10-NEXT: v_add_nc_i32 v14, s14, s30 clamp +; GFX10-NEXT: v_add_nc_i32 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s35, s0, 0 -; GFX10-NEXT: s_sub_i32 s35, s34, s35 -; GFX10-NEXT: s_cmp_gt_i32 s35, s16 -; GFX10-NEXT: s_cselect_b32 s16, s35, s16 -; GFX10-NEXT: s_cmp_lt_i32 s16, s46 -; GFX10-NEXT: s_cselect_b32 s46, s16, s46 -; GFX10-NEXT: s_add_i32 s0, s0, s46 -; GFX10-NEXT: s_cmp_gt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s46, s1, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: s_cselect_b32 s33, s1, 0 -; GFX10-NEXT: s_sub_i32 s46, s34, s33 -; GFX10-NEXT: s_cmp_gt_i32 s46, s17 -; GFX10-NEXT: s_cselect_b32 s17, s46, s17 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s46, s17, s16 -; GFX10-NEXT: s_add_i32 s1, s1, s46 -; GFX10-NEXT: s_cmp_gt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s46, s2, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s2, 0 -; GFX10-NEXT: s_cselect_b32 s17, s2, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_gt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s46, s3, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s17, s3, 0 -; 
GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s19 -; GFX10-NEXT: s_cselect_b32 s17, s17, s19 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_gt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s46, s4, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s17, s4, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s20 -; GFX10-NEXT: s_cselect_b32 s17, s17, s20 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_gt_i32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s46, s5, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s5, 0 -; GFX10-NEXT: s_cselect_b32 s17, s5, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s21 -; GFX10-NEXT: s_cselect_b32 s17, s17, s21 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_gt_i32 s6, 0 -; GFX10-NEXT: s_cselect_b32 s46, s6, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s6, 0 -; GFX10-NEXT: s_cselect_b32 s17, s6, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s22 -; GFX10-NEXT: s_cselect_b32 s17, s17, s22 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_gt_i32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s46, s7, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s17, s7, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s23 -; GFX10-NEXT: s_cselect_b32 s17, s17, s23 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_gt_i32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s46, s8, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s17, s8, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s24 -; GFX10-NEXT: s_cselect_b32 s17, s17, s24 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_gt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s46, s9, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s9, 0 -; GFX10-NEXT: s_cselect_b32 s17, s9, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s25 -; GFX10-NEXT: s_cselect_b32 s17, s17, s25 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_gt_i32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s46, s10, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s17, s10, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s26 -; GFX10-NEXT: s_cselect_b32 s17, s17, s26 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_gt_i32 s11, 0 -; GFX10-NEXT: s_cselect_b32 s46, s11, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s11, 0 -; GFX10-NEXT: s_cselect_b32 s17, s11, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s27 -; GFX10-NEXT: s_cselect_b32 s17, s17, s27 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; 
GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_gt_i32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s46, s12, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s17, s12, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s28 -; GFX10-NEXT: s_cselect_b32 s17, s17, s28 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_gt_i32 s13, 0 -; GFX10-NEXT: s_cselect_b32 s46, s13, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s13, 0 -; GFX10-NEXT: s_cselect_b32 s17, s13, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s29 -; GFX10-NEXT: s_cselect_b32 s17, s17, s29 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_gt_i32 s14, 0 -; GFX10-NEXT: s_cselect_b32 s46, s14, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s46 -; GFX10-NEXT: s_cmp_lt_i32 s14, 0 -; GFX10-NEXT: s_cselect_b32 s17, s14, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s30 -; GFX10-NEXT: s_cselect_b32 s17, s17, s30 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_gt_i32 s15, 0 -; GFX10-NEXT: s_cselect_b32 s30, s15, 0 -; GFX10-NEXT: s_sub_i32 s16, s32, s30 -; GFX10-NEXT: s_cmp_lt_i32 s15, 0 -; GFX10-NEXT: s_cselect_b32 s17, s15, 0 -; GFX10-NEXT: s_sub_i32 s17, s34, s17 -; GFX10-NEXT: s_cmp_gt_i32 s17, s31 -; GFX10-NEXT: s_cselect_b32 s17, s17, s31 -; GFX10-NEXT: s_cmp_lt_i32 s17, s16 -; GFX10-NEXT: s_cselect_b32 s16, s17, s16 -; GFX10-NEXT: s_add_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -4241,27 +2855,15 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_saddsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v3, 0x8000, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_max_i16_e32 v1, v3, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i16_e64 v2, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v3, v0, 0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x8000, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, 0x7fff, v3 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4295,7 +2897,7 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_cmp_lt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s2, s1 @@ -4309,45 +2911,16 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 
inreg %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: s_saddsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, 0 -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s2, s3 -; GFX9-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, 0 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s2 +; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s3, s2 -; GFX10-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s2 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_sub_i32 s2, 0x8000, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_sext_i32_i16 s2, s4 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4379,7 +2952,7 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_cmp_lt_i32 s1, s2 ; GFX8-NEXT: s_cselect_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, 0x8000, s1 +; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX8-NEXT: v_max_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 @@ -4387,33 +2960,13 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX9-LABEL: saddsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_sext_i32_i16 s2, 0 -; GFX9-NEXT: s_cmp_gt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s3, s1, s2 -; GFX9-NEXT: s_sub_i32 s3, 0x7fff, s3 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, 0x8000, s1 -; GFX9-NEXT: v_max_i16_e32 v0, s1, v0 -; GFX9-NEXT: v_min_i16_e32 v0, s3, v0 -; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, 0 +; GFX10-NEXT: v_add_nc_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_gt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s3, s1, s2 -; GFX10-NEXT: s_sub_i32 s3, 0x7fff, s3 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, 0x8000, s1 -; GFX10-NEXT: v_max_i16_e64 v0, s1, v0 -; GFX10-NEXT: v_min_i16_e64 v0, v0, s3 -; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4448,25 +3001,13 @@ define 
amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: saddsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_i16_e32 v2, 0, v0 -; GFX9-NEXT: v_max_i16_e32 v1, 0, v0 -; GFX9-NEXT: v_sub_u16_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_max_i16_e32 v2, s0, v2 -; GFX9-NEXT: v_min_i16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_i16_e64 v1, v0, 0 -; GFX10-NEXT: v_max_i16_e64 v2, v0, 0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v1, 0x8000, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, 0x7fff, v2 -; GFX10-NEXT: v_max_i16_e64 v1, v1, s0 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v2 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4478,7 +3019,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 @@ -4504,7 +3045,7 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4527,37 +3068,15 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, s5, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 -; GFX9-NEXT: v_pk_max_i16 v1, v3, v1 -; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX10-NEXT: v_pk_min_i16 v2, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v3, v0, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff +; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, s5, v2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 -; GFX10-NEXT: v_pk_sub_i16 v3, s4, v3 -; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -4573,7 +3092,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cselect_b32 s6, s0, 0 ; GFX6-NEXT: 
s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0 ; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, s2 @@ -4614,7 +3133,7 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_cselect_b32 s8, s6, s7 ; GFX8-NEXT: s_sub_i32 s8, s4, s8 ; GFX8-NEXT: s_cmp_lt_i32 s6, s7 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: s_cselect_b32 s6, s6, s7 ; GFX8-NEXT: s_sub_i32 s6, s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -4650,115 +3169,16 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_saddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 -; GFX9-NEXT: s_ashr_i32 s6, s0, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s8, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s8, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 -; GFX9-NEXT: s_sub_i32 s8, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_saddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 -; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s5 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s6, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX10-NEXT: s_cselect_b32 s8, s4, s2 +; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX10-NEXT: s_lshr_b32 s8, s7, 16 -; GFX10-NEXT: s_lshr_b32 s9, s6, 16 
-; GFX10-NEXT: s_sub_i32 s6, s7, s6 -; GFX10-NEXT: s_sub_i32 s7, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX10-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s4, s2 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_sext_i32_i16 s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to i32 @@ -4774,7 +3194,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_cselect_b32 s4, s0, 0 ; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: s_cselect_b32 s5, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_sub_i32 s5, s3, s5 @@ -4811,7 +3231,7 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_cselect_b32 s6, s4, s5 ; GFX8-NEXT: s_sub_i32 s6, s2, s6 ; GFX8-NEXT: s_cmp_lt_i32 s4, s5 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s4, s3, s4 ; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 @@ -4834,75 +3254,13 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_ashr_i32 s5, s0, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s7, s4, s6 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_cselect_b32 s8, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s7, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s7 -; GFX9-NEXT: s_sub_i32 s7, s8, s9 -; GFX9-NEXT: s_cmp_lt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: 
s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: v_pk_max_i16 v0, s2, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s1 -; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 +; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff -; GFX10-NEXT: s_cselect_b32 s5, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: s_cselect_b32 s7, s3, s1 +; GFX10-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX10-NEXT: s_lshr_b32 s7, s6, 16 -; GFX10-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX10-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 -; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -4913,7 +3271,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 @@ -4943,7 +3301,7 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -4966,35 +3324,13 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: v_pk_min_i16 v2, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 -; GFX9-NEXT: v_pk_max_i16 v2, v2, s0 -; GFX9-NEXT: v_pk_min_i16 v1, v2, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_mov_b32 s2, 0xffff8000 -; GFX10-NEXT: v_pk_min_i16 v1, v0, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX10-NEXT: 
v_pk_max_i16 v2, v0, s1 -; GFX10-NEXT: s_movk_i32 s3, 0x7fff +; GFX10-NEXT: v_pk_add_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, s2, v1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 -; GFX10-NEXT: v_pk_sub_i16 v2, s1, v2 -; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -5017,7 +3353,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 @@ -5046,7 +3382,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 @@ -5074,7 +3410,7 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 @@ -5113,51 +3449,17 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v5, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 -; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 -; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 -; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_min_i16 v2, v3, v2 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 -; GFX10-NEXT: v_pk_min_i16 v4, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v5, v1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v7, v1, s5 -; GFX10-NEXT: v_pk_sub_i16 v4, s6, v4 -; GFX10-NEXT: v_pk_sub_i16 v5, s6, v5 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; 
-; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6
-; GFX10-NEXT: v_pk_sub_i16 v4, s4, v7
-; GFX10-NEXT: v_pk_max_i16 v3, v5, v3
-; GFX10-NEXT: v_pk_min_i16 v2, v11, v6
-; GFX10-NEXT: v_pk_min_i16 v3, v3, v4
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v2
-; GFX10-NEXT: v_pk_add_u16 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x float>
@@ -5174,7 +3476,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT: s_cselect_b32 s10, s0, 0
 ; GFX6-NEXT: s_sub_i32 s10, s8, s10
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s9, 0x80000000
+; GFX6-NEXT: s_brev_b32 s9, 1
 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0
 ; GFX6-NEXT: s_sub_i32 s11, s9, s11
 ; GFX6-NEXT: s_cmp_gt_i32 s11, s4
@@ -5249,7 +3551,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX8-NEXT: s_cselect_b32 s12, s10, s11
 ; GFX8-NEXT: s_sub_i32 s12, s8, s12
 ; GFX8-NEXT: s_cmp_lt_i32 s10, s11
-; GFX8-NEXT: s_mov_b32 s9, 0x8000
+; GFX8-NEXT: s_movk_i32 s9, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s10, s10, s11
 ; GFX8-NEXT: s_sub_i32 s10, s9, s10
 ; GFX8-NEXT: s_sext_i32_i16 s10, s10
@@ -5321,201 +3623,21 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v4i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT: s_sext_i32_i16 s9, s6
-; GFX9-NEXT: s_sext_i32_i16 s7, s0
-; GFX9-NEXT: s_ashr_i32 s8, s0, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_gt_i32 s7, s9
-; GFX9-NEXT: s_cselect_b32 s10, s7, s9
-; GFX9-NEXT: s_cmp_gt_i32 s8, s6
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s11, s8, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT: s_lshr_b32 s12, s10, 16
-; GFX9-NEXT: s_lshr_b32 s11, s4, 16
-; GFX9-NEXT: s_sub_i32 s10, s4, s10
-; GFX9-NEXT: s_sub_i32 s12, s11, s12
-; GFX9-NEXT: s_cmp_lt_i32 s7, s9
-; GFX9-NEXT: s_cselect_b32 s7, s7, s9
-; GFX9-NEXT: s_cmp_lt_i32 s8, s6
-; GFX9-NEXT: s_mov_b32 s5, 0xffff8000
-; GFX9-NEXT: s_cselect_b32 s8, s8, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12
-; GFX9-NEXT: s_lshr_b32 s12, s7, 16
-; GFX9-NEXT: s_lshr_b32 s8, s5, 16
-; GFX9-NEXT: s_sub_i32 s7, s5, s7
-; GFX9-NEXT: s_sub_i32 s12, s8, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12
-; GFX9-NEXT: s_sext_i32_i16 s12, s7
-; GFX9-NEXT: s_sext_i32_i16 s13, s2
-; GFX9-NEXT: s_ashr_i32 s7, s7, 16
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s12, s13
-; GFX9-NEXT: s_cselect_b32 s12, s12, s13
-; GFX9-NEXT: s_cmp_gt_i32 s7, s2
-; GFX9-NEXT: s_cselect_b32 s2, s7, s2
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2
-; GFX9-NEXT: s_sext_i32_i16 s7, s2
-; GFX9-NEXT: s_sext_i32_i16 s12, s10
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_ashr_i32 s10, s10, 16
-; GFX9-NEXT: s_cmp_lt_i32 s7, s12
-; GFX9-NEXT: s_cselect_b32 s7, s7, s12
-; GFX9-NEXT: s_cmp_lt_i32 s2, s10
-; GFX9-NEXT: s_cselect_b32 s2, s2, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2
-; GFX9-NEXT: s_lshr_b32 s7, s0, 16
-; GFX9-NEXT: s_lshr_b32 s10, s2, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s2
-; GFX9-NEXT: s_add_i32 s7, s7, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7
-; GFX9-NEXT: s_sext_i32_i16 s2, s1
-; GFX9-NEXT: s_ashr_i32 s7, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s2, s9
-; GFX9-NEXT: s_cselect_b32 s10, s2, s9
-; GFX9-NEXT: s_cmp_gt_i32 s7, s6
-; GFX9-NEXT: s_cselect_b32 s12, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12
-; GFX9-NEXT: s_lshr_b32 s12, s10, 16
-; GFX9-NEXT: s_sub_i32 s4, s4, s10
-; GFX9-NEXT: s_sub_i32 s10, s11, s12
-; GFX9-NEXT: s_cmp_lt_i32 s2, s9
-; GFX9-NEXT: s_cselect_b32 s2, s2, s9
-; GFX9-NEXT: s_cmp_lt_i32 s7, s6
-; GFX9-NEXT: s_cselect_b32 s6, s7, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
-; GFX9-NEXT: s_lshr_b32 s6, s2, 16
-; GFX9-NEXT: s_sub_i32 s2, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s8, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT: s_sext_i32_i16 s5, s2
-; GFX9-NEXT: s_sext_i32_i16 s6, s3
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_gt_i32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10
-; GFX9-NEXT: s_sext_i32_i16 s3, s2
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_lt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_cmp_lt_i32 s2, s4
-; GFX9-NEXT: s_cselect_b32 s2, s2, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_add_i32 s1, s1, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v4i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT: s_sext_i32_i16 s5, s0
-; GFX10-NEXT: s_sext_i32_i16 s7, s4
-; GFX10-NEXT: s_ashr_i32 s6, s0, 16
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_gt_i32 s5, s7
-; GFX10-NEXT: s_movk_i32 s9, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s8, s5, s7
-; GFX10-NEXT: s_cmp_gt_i32 s6, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s9
-; GFX10-NEXT: s_cselect_b32 s10, s6, s4
-; GFX10-NEXT: s_mov_b32 s12, 0xffff8000
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10
-; GFX10-NEXT: s_lshr_b32 s10, s9, 16
-; GFX10-NEXT: s_lshr_b32 s11, s8, 16
-; GFX10-NEXT: s_sub_i32 s8, s9, s8
-; GFX10-NEXT: s_sub_i32 s11, s10, s11
-; GFX10-NEXT: s_cmp_lt_i32 s5, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12
-; GFX10-NEXT: s_cselect_b32 s5, s5, s7
-; GFX10-NEXT: s_cmp_lt_i32 s6, s4
-; GFX10-NEXT: s_sext_i32_i16 s14, s2
-; GFX10-NEXT: s_cselect_b32 s6, s6, s4
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX10-NEXT: s_lshr_b32 s6, s12, 16
-; GFX10-NEXT: s_lshr_b32 s13, s5, 16
-; GFX10-NEXT: s_sub_i32 s5, s12, s5
-; GFX10-NEXT: s_sub_i32 s13, s6, s13
+; GFX10-NEXT: v_pk_add_i16 v0, s0, s2 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s13
-; GFX10-NEXT: s_sext_i32_i16 s13, s5
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_gt_i32 s13, s14
-; GFX10-NEXT: s_cselect_b32 s13, s13, s14
-; GFX10-NEXT: s_cmp_gt_i32 s5, s2
-; GFX10-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s8, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s13, s2
-; GFX10-NEXT: s_sext_i32_i16 s11, s5
-; GFX10-NEXT: s_sext_i32_i16 s8, s2
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_lt_i32 s8, s11
-; GFX10-NEXT: s_cselect_b32 s8, s8, s11
-; GFX10-NEXT: s_cmp_lt_i32 s2, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s5, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s8, s2
-; GFX10-NEXT: s_lshr_b32 s8, s2, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s2
-; GFX10-NEXT: s_sext_i32_i16 s2, s1
-; GFX10-NEXT: s_add_i32 s5, s5, s8
-; GFX10-NEXT: s_ashr_i32 s8, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s2, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX10-NEXT: s_cselect_b32 s11, s2, s7
-; GFX10-NEXT: s_cmp_gt_i32 s8, s4
-; GFX10-NEXT: s_cselect_b32 s13, s8, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s13
-; GFX10-NEXT: s_lshr_b32 s13, s11, 16
-; GFX10-NEXT: s_sub_i32 s9, s9, s11
-; GFX10-NEXT: s_sub_i32 s10, s10, s13
-; GFX10-NEXT: s_cmp_lt_i32 s2, s7
-; GFX10-NEXT: s_cselect_b32 s2, s2, s7
-; GFX10-NEXT: s_cmp_lt_i32 s8, s4
-; GFX10-NEXT: s_cselect_b32 s4, s8, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_sub_i32 s2, s12, s2
-; GFX10-NEXT: s_sub_i32 s4, s6, s4
-; GFX10-NEXT: s_sext_i32_i16 s6, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_sext_i32_i16 s4, s2
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_cmp_gt_i32 s2, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s10
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
-; GFX10-NEXT: s_sext_i32_i16 s3, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2
-; GFX10-NEXT: s_ashr_i32 s4, s6, 16
-; GFX10-NEXT: s_sext_i32_i16 s6, s2
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16
-; GFX10-NEXT: s_cmp_lt_i32 s6, s3
-; GFX10-NEXT: s_cselect_b32 s3, s6, s3
-; GFX10-NEXT: s_cmp_lt_i32 s2, s4
-; GFX10-NEXT: s_cselect_b32 s2, s2, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_add_i32 s1, s1, s2
-; GFX10-NEXT: s_add_i32 s3, s3, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x i32>
@@ -5538,7 +3660,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14
@@ -5567,7 +3689,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7
-; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v15, 1
 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3
@@ -5619,7 +3741,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v6i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v11, 0, v0
 ; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11
 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
@@ -5646,7 +3768,7 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14
 ; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13
-; GFX8-NEXT: v_mov_b32_e32 v12, 0x8000
+; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000
 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v2
 ; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14
 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v13
@@ -5676,65 +3798,19 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v6i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0xffff8000
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT: v_pk_min_i16 v7, v0, s6
-; GFX9-NEXT: v_pk_sub_i16 v7, s5, v7
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT: v_pk_max_i16 v6, v0, s6
-; GFX9-NEXT: v_pk_sub_i16 v6, s4, v6
-; GFX9-NEXT: v_pk_max_i16 v3, v7, v3
-; GFX9-NEXT: v_pk_min_i16 v3, v3, v6
-; GFX9-NEXT: v_pk_min_i16 v6, v1, s6
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
-; GFX9-NEXT: v_pk_max_i16 v3, v1, s6
-; GFX9-NEXT: v_pk_sub_i16 v6, s5, v6
-; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3
-; GFX9-NEXT: v_pk_max_i16 v4, v6, v4
-; GFX9-NEXT: v_pk_min_i16 v3, v4, v3
-; GFX9-NEXT: v_pk_min_i16 v4, v2, s6
-; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
-; GFX9-NEXT: v_pk_max_i16 v3, v2, s6
-; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3
-; GFX9-NEXT: v_pk_max_i16 v4, v4, v5
-; GFX9-NEXT: v_pk_min_i16 v3, v4, v3
-; GFX9-NEXT: v_pk_add_u16 v2, v2, v3
+; GFX9-NEXT: v_pk_add_i16 v0, v0, v3 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, v1, v4 clamp
+; GFX9-NEXT: v_pk_add_i16 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v6i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0
-; GFX10-NEXT: s_mov_b32 s6, 0xffff8000
-; GFX10-NEXT: v_pk_min_i16 v7, v0, s5
-; GFX10-NEXT: v_pk_min_i16 v8, v1, s5
-; GFX10-NEXT: v_pk_min_i16 v9, v2, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6
-; GFX10-NEXT: v_pk_max_i16 v6, v0, s5
-; GFX10-NEXT: v_pk_sub_i16 v14, s6, v7
-; GFX10-NEXT: v_pk_sub_i16 v15, s6, v8
-; GFX10-NEXT: v_pk_sub_i16 v19, s6, v9
-; GFX10-NEXT: v_pk_max_i16 v10, v1, s5
-; GFX10-NEXT: v_pk_max_i16 v11, v2, s5
-; GFX10-NEXT: s_movk_i32 s4, 0x7fff
-; GFX10-NEXT: v_pk_max_i16 v3, v14, v3
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4
-; GFX10-NEXT: v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6
-; GFX10-NEXT: v_pk_sub_i16 v7, s4, v10
-; GFX10-NEXT: v_pk_sub_i16 v8, s4, v11
-; GFX10-NEXT: v_pk_max_i16 v5, v19, v5
+; GFX10-NEXT: v_pk_add_i16 v0, v0, v3 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, v1, v4 clamp
+; GFX10-NEXT: v_pk_add_i16 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_min_i16 v3, v3, v6
-; GFX10-NEXT: v_pk_min_i16 v4, v4, v7
-; GFX10-NEXT: v_pk_min_i16 v5, v5, v8
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v3
-; GFX10-NEXT: v_pk_add_u16 v1, v1, v4
-; GFX10-NEXT: v_pk_add_u16 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
 %cast = bitcast <6 x i16> %result to <3 x float>
@@ -5751,7 +3827,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT: s_cselect_b32 s14, s0, 0
 ; GFX6-NEXT: s_sub_i32 s14, s12, s14
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s13, 0x80000000
+; GFX6-NEXT: s_brev_b32 s13, 1
 ; GFX6-NEXT: s_cselect_b32 s15, s0, 0
 ; GFX6-NEXT: s_sub_i32 s15, s13, s15
 ; GFX6-NEXT: s_cmp_gt_i32 s15, s6
@@ -5860,7 +3936,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX8-NEXT: s_cselect_b32 s16, s14, s15
 ; GFX8-NEXT: s_sub_i32 s16, s12, s16
 ; GFX8-NEXT: s_cmp_lt_i32 s14, s15
-; GFX8-NEXT: s_mov_b32 s13, 0x8000
+; GFX8-NEXT: s_movk_i32 s13, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s14, s14, s15
 ; GFX8-NEXT: s_sub_i32 s14, s13, s14
 ; GFX8-NEXT: s_sext_i32_i16 s14, s14
@@ -5968,287 +4044,26 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v6i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, 0, 0
-; GFX9-NEXT: s_sext_i32_i16 s11, s8
-; GFX9-NEXT: s_sext_i32_i16 s9, s0
-; GFX9-NEXT: s_ashr_i32 s10, s0, 16
-; GFX9-NEXT: s_ashr_i32 s8, s8, 16
-; GFX9-NEXT: s_cmp_gt_i32 s9, s11
-; GFX9-NEXT: s_cselect_b32 s12, s9, s11
-; GFX9-NEXT: s_cmp_gt_i32 s10, s8
-; GFX9-NEXT: s_movk_i32 s6, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s13, s10, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6
-; GFX9-NEXT: s_lshr_b32 s14, s12, 16
-; GFX9-NEXT: s_lshr_b32 s13, s6, 16
-; GFX9-NEXT: s_sub_i32 s12, s6, s12
-; GFX9-NEXT: s_sub_i32 s14, s13, s14
-; GFX9-NEXT: s_cmp_lt_i32 s9, s11
-; GFX9-NEXT: s_cselect_b32 s9, s9, s11
-; GFX9-NEXT: s_cmp_lt_i32 s10, s8
-; GFX9-NEXT: s_mov_b32 s7, 0xffff8000
-; GFX9-NEXT: s_cselect_b32 s10, s10, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT: s_lshr_b32 s14, s9, 16
-; GFX9-NEXT: s_lshr_b32 s10, s7, 16
-; GFX9-NEXT: s_sub_i32 s9, s7, s9
-; GFX9-NEXT: s_sub_i32 s14, s10, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14
-; GFX9-NEXT: s_sext_i32_i16 s14, s9
-; GFX9-NEXT: s_sext_i32_i16 s15, s3
-; GFX9-NEXT: s_ashr_i32 s9, s9, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s14, s15
-; GFX9-NEXT: s_cselect_b32 s14, s14, s15
-; GFX9-NEXT: s_cmp_gt_i32 s9, s3
-; GFX9-NEXT: s_cselect_b32 s3, s9, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s14, s3
-; GFX9-NEXT: s_sext_i32_i16 s9, s3
-; GFX9-NEXT: s_sext_i32_i16 s14, s12
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s12, s12, 16
-; GFX9-NEXT: s_cmp_lt_i32 s9, s14
-; GFX9-NEXT: s_cselect_b32 s9, s9, s14
-; GFX9-NEXT: s_cmp_lt_i32 s3, s12
-; GFX9-NEXT: s_cselect_b32 s3, s3, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3
-; GFX9-NEXT: s_lshr_b32 s9, s0, 16
-; GFX9-NEXT: s_lshr_b32 s12, s3, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s3
-; GFX9-NEXT: s_add_i32 s9, s9, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_ashr_i32 s9, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s3, s11
-; GFX9-NEXT: s_cselect_b32 s12, s3, s11
-; GFX9-NEXT: s_cmp_gt_i32 s9, s8
-; GFX9-NEXT: s_cselect_b32 s14, s9, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT: s_lshr_b32 s14, s12, 16
-; GFX9-NEXT: s_sub_i32 s12, s6, s12
-; GFX9-NEXT: s_sub_i32 s14, s13, s14
-; GFX9-NEXT: s_cmp_lt_i32 s3, s11
-; GFX9-NEXT: s_cselect_b32 s3, s3, s11
-; GFX9-NEXT: s_cmp_lt_i32 s9, s8
-; GFX9-NEXT: s_cselect_b32 s9, s9, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT: s_lshr_b32 s9, s3, 16
-; GFX9-NEXT: s_sub_i32 s3, s7, s3
-; GFX9-NEXT: s_sub_i32 s9, s10, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14
-; GFX9-NEXT: s_sext_i32_i16 s9, s3
-; GFX9-NEXT: s_sext_i32_i16 s14, s4
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_gt_i32 s9, s14
-; GFX9-NEXT: s_cselect_b32 s9, s9, s14
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_sext_i32_i16 s9, s12
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s12, s12, 16
-; GFX9-NEXT: s_cmp_lt_i32 s4, s9
-; GFX9-NEXT: s_cselect_b32 s4, s4, s9
-; GFX9-NEXT: s_cmp_lt_i32 s3, s12
-; GFX9-NEXT: s_cselect_b32 s3, s3, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s9, s3, 16
-; GFX9-NEXT: s_add_i32 s1, s1, s3
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT: s_sext_i32_i16 s3, s2
-; GFX9-NEXT: s_ashr_i32 s4, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s3, s11
-; GFX9-NEXT: s_cselect_b32 s9, s3, s11
-; GFX9-NEXT: s_cmp_gt_i32 s4, s8
-; GFX9-NEXT: s_cselect_b32 s12, s4, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s12
-; GFX9-NEXT: s_lshr_b32 s12, s9, 16
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_sub_i32 s9, s13, s12
-; GFX9-NEXT: s_cmp_lt_i32 s3, s11
-; GFX9-NEXT: s_cselect_b32 s3, s3, s11
-; GFX9-NEXT: s_cmp_lt_i32 s4, s8
-; GFX9-NEXT: s_cselect_b32 s4, s4, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_sub_i32 s3, s7, s3
-; GFX9-NEXT: s_sub_i32 s4, s10, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_sext_i32_i16 s7, s5
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_cmp_gt_i32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s9
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_sext_i32_i16 s5, s6
-; GFX9-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_lt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s3, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: s_add_i32 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v6i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, 0, 0
-; GFX10-NEXT: s_sext_i32_i16 s7, s0
-; GFX10-NEXT: s_sext_i32_i16 s9, s6
-; GFX10-NEXT: s_ashr_i32 s8, s0, 16
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_gt_i32 s7, s9
-; GFX10-NEXT: s_movk_i32 s11, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s10, s7, s9
-; GFX10-NEXT: s_cmp_gt_i32 s8, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s11
-; GFX10-NEXT: s_cselect_b32 s12, s8, s6
-; GFX10-NEXT: s_mov_b32 s14, 0xffff8000
-; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12
-; GFX10-NEXT: s_lshr_b32 s12, s11, 16
-; GFX10-NEXT: s_lshr_b32 s13, s10, 16
-; GFX10-NEXT: s_sub_i32 s10, s11, s10
-; GFX10-NEXT: s_sub_i32 s13, s12, s13
-; GFX10-NEXT: s_cmp_lt_i32 s7, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14
-; GFX10-NEXT: s_cselect_b32 s7, s7, s9
-; GFX10-NEXT: s_cmp_lt_i32 s8, s6
-; GFX10-NEXT: s_sext_i32_i16 s16, s3
-; GFX10-NEXT: s_cselect_b32 s8, s8, s6
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX10-NEXT: s_lshr_b32 s8, s14, 16
-; GFX10-NEXT: s_lshr_b32 s15, s7, 16
-; GFX10-NEXT: s_sub_i32 s7, s14, s7
-; GFX10-NEXT: s_sub_i32 s15, s8, s15
+; GFX10-NEXT: v_pk_add_i16 v0, s0, s3 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, s1, s4 clamp
+; GFX10-NEXT: v_pk_add_i16 v2, s2, s5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s15
-; GFX10-NEXT: s_sext_i32_i16 s15, s7
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_cmp_gt_i32 s15, s16
-; GFX10-NEXT: s_cselect_b32 s15, s15, s16
-; GFX10-NEXT: s_cmp_gt_i32 s7, s3
-; GFX10-NEXT: s_sext_i32_i16 s16, s4
-; GFX10-NEXT: s_cselect_b32 s3, s7, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s7, s10, s13
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s15, s3
-; GFX10-NEXT: s_sext_i32_i16 s13, s7
-; GFX10-NEXT: s_sext_i32_i16 s10, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_cmp_lt_i32 s10, s13
-; GFX10-NEXT: s_cselect_b32 s10, s10, s13
-; GFX10-NEXT: s_cmp_lt_i32 s3, s7
-; GFX10-NEXT: s_cselect_b32 s3, s3, s7
-; GFX10-NEXT: s_lshr_b32 s7, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT: s_lshr_b32 s10, s3, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s3
-; GFX10-NEXT: s_sext_i32_i16 s3, s1
-; GFX10-NEXT: s_add_i32 s7, s7, s10
-; GFX10-NEXT: s_ashr_i32 s10, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s3, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7
-; GFX10-NEXT: s_cselect_b32 s13, s3, s9
-; GFX10-NEXT: s_cmp_gt_i32 s10, s6
-; GFX10-NEXT: s_cselect_b32 s15, s10, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT: s_lshr_b32 s15, s13, 16
-; GFX10-NEXT: s_sub_i32 s13, s11, s13
-; GFX10-NEXT: s_sub_i32 s15, s12, s15
-; GFX10-NEXT: s_cmp_lt_i32 s3, s9
-; GFX10-NEXT: s_cselect_b32 s3, s3, s9
-; GFX10-NEXT: s_cmp_lt_i32 s10, s6
-; GFX10-NEXT: s_cselect_b32 s10, s10, s6
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT: s_lshr_b32 s10, s3, 16
-; GFX10-NEXT: s_sub_i32 s3, s14, s3
-; GFX10-NEXT: s_sub_i32 s10, s8, s10
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10
-; GFX10-NEXT: s_sext_i32_i16 s10, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_gt_i32 s10, s16
-; GFX10-NEXT: s_cselect_b32 s10, s10, s16
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s13, s15
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT: s_sext_i32_i16 s13, s4
-; GFX10-NEXT: s_sext_i32_i16 s10, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_lt_i32 s10, s13
-; GFX10-NEXT: s_cselect_b32 s10, s10, s13
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3
-; GFX10-NEXT: s_lshr_b32 s10, s1, 16
-; GFX10-NEXT: s_lshr_b32 s13, s3, 16
-; GFX10-NEXT: s_add_i32 s1, s1, s3
-; GFX10-NEXT: s_add_i32 s10, s10, s13
-; GFX10-NEXT: s_ashr_i32 s3, s2, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s9
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s10
-; GFX10-NEXT: s_cselect_b32 s13, s4, s9
-; GFX10-NEXT: s_cmp_gt_i32 s3, s6
-; GFX10-NEXT: s_cselect_b32 s15, s3, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s15
-; GFX10-NEXT: s_lshr_b32 s15, s13, 16
-; GFX10-NEXT: s_sub_i32 s11, s11, s13
-; GFX10-NEXT: s_sub_i32 s12, s12, s15
-; GFX10-NEXT: s_cmp_lt_i32 s4, s9
-; GFX10-NEXT: s_cselect_b32 s4, s4, s9
-; GFX10-NEXT: s_cmp_lt_i32 s3, s6
-; GFX10-NEXT: s_cselect_b32 s3, s3, s6
-; GFX10-NEXT: s_sext_i32_i16 s6, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3
-; GFX10-NEXT: s_lshr_b32 s4, s3, 16
-; GFX10-NEXT: s_sub_i32 s3, s14, s3
-; GFX10-NEXT: s_sub_i32 s4, s8, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX10-NEXT: s_ashr_i32 s4, s5, 16
-; GFX10-NEXT: s_sext_i32_i16 s5, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_gt_i32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s5, s5, s6
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s11, s12
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3
-; GFX10-NEXT: s_ashr_i32 s5, s6, 16
-; GFX10-NEXT: s_sext_i32_i16 s6, s3
-; GFX10-NEXT: s_ashr_i32 s3, s3, 16
-; GFX10-NEXT: s_cmp_lt_i32 s6, s4
-; GFX10-NEXT: s_cselect_b32 s4, s6, s4
-; GFX10-NEXT: s_cmp_lt_i32 s3, s5
-; GFX10-NEXT: s_cselect_b32 s3, s3, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
-; GFX10-NEXT: s_add_i32 s2, s2, s3
-; GFX10-NEXT: s_add_i32 s4, s4, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
 %cast = bitcast <6 x i16> %result to <3 x i32>
@@ -6260,7 +4075,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18
@@ -6289,7 +4104,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8
 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9
-; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000
+; GFX6-NEXT: v_bfrev_b32_e32 v19, 1
 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3
 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3
@@ -6365,7 +4180,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-LABEL: v_saddsat_v8i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v0
 ; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14
 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0
@@ -6392,7 +4207,7 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17
 ; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16
-; GFX8-NEXT: v_mov_b32_e32 v15, 0x8000
+; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000
 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v2
 ; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17
 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2
@@ -6438,79 +4253,21 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX9-LABEL: v_saddsat_v8i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s5, 0xffff8000
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5
-; GFX9-NEXT: v_pk_min_i16 v9, v0, s6
-; GFX9-NEXT: v_pk_sub_i16 v9, s5, v9
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
-; GFX9-NEXT: v_pk_max_i16 v8, v0, s6
-; GFX9-NEXT: v_pk_sub_i16 v8, s4, v8
-; GFX9-NEXT: v_pk_max_i16 v4, v9, v4
-; GFX9-NEXT: v_pk_min_i16 v4, v4, v8
-; GFX9-NEXT: v_pk_min_i16 v8, v1, s6
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v1, s6
-; GFX9-NEXT: v_pk_sub_i16 v8, s5, v8
-; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4
-; GFX9-NEXT: v_pk_max_i16 v5, v8, v5
-; GFX9-NEXT: v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT: v_pk_min_i16 v5, v2, s6
-; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v2, s6
-; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4
-; GFX9-NEXT: v_pk_max_i16 v5, v5, v6
-; GFX9-NEXT: v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT: v_pk_min_i16 v5, v3, s6
-; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5
-; GFX9-NEXT: v_pk_add_u16 v2, v2, v4
-; GFX9-NEXT: v_pk_max_i16 v4, v3, s6
-; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4
-; GFX9-NEXT: v_pk_max_i16 v5, v5, v7
-; GFX9-NEXT: v_pk_min_i16 v4, v5, v4
-; GFX9-NEXT: v_pk_add_u16 v3, v3, v4
+; GFX9-NEXT: v_pk_add_i16 v0, v0, v4 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, v1, v5 clamp
+; GFX9-NEXT: v_pk_add_i16 v2, v2, v6 clamp
+; GFX9-NEXT: v_pk_add_i16 v3, v3, v7 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v8i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0
-; GFX10-NEXT: s_mov_b32 s5, 0xffff8000
-; GFX10-NEXT: v_pk_min_i16 v8, v0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5
-; GFX10-NEXT: v_pk_min_i16 v11, v1, s4
-; GFX10-NEXT: v_pk_min_i16 v12, v3, s4
-; GFX10-NEXT: v_pk_max_i16 v9, v0, s4
-; GFX10-NEXT: v_pk_sub_i16 v15, s5, v8
-; GFX10-NEXT: v_pk_min_i16 v8, v2, s4
-; GFX10-NEXT: v_pk_sub_i16 v11, s5, v11
-; GFX10-NEXT: v_pk_sub_i16 v12, s5, v12
-; GFX10-NEXT: v_pk_max_i16 v10, v1, s4
-; GFX10-NEXT: v_pk_max_i16 v13, v2, s4
-; GFX10-NEXT: v_pk_sub_i16 v8, s5, v8
-; GFX10-NEXT: v_pk_max_i16 v14, v3, s4
-; GFX10-NEXT: s_movk_i32 s6, 0x7fff
-; GFX10-NEXT: v_pk_max_i16 v4, v15, v4
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6
-; GFX10-NEXT: v_pk_max_i16 v5, v11, v5
-; GFX10-NEXT: v_pk_sub_i16 v9, s6, v9
-; GFX10-NEXT: v_pk_sub_i16 v10, s6, v10
-; GFX10-NEXT: v_pk_max_i16 v6, v8, v6
-; GFX10-NEXT: v_pk_sub_i16 v11, s6, v13
-; GFX10-NEXT: v_pk_sub_i16 v8, s6, v14
-; GFX10-NEXT: v_pk_max_i16 v7, v12, v7
-; GFX10-NEXT: v_pk_min_i16 v15, v4, v9
-; GFX10-NEXT: v_pk_min_i16 v19, v5, v10
-; GFX10-NEXT: v_pk_min_i16 v11, v6, v11
+; GFX10-NEXT: v_pk_add_i16 v0, v0, v4 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, v1, v5 clamp
+; GFX10-NEXT: v_pk_add_i16 v2, v2, v6 clamp
+; GFX10-NEXT: v_pk_add_i16 v3, v3, v7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_pk_min_i16 v6, v7, v8
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v15
-; GFX10-NEXT: v_pk_add_u16 v1, v1, v19
-; GFX10-NEXT: v_pk_add_u16 v2, v2, v11
-; GFX10-NEXT: v_pk_add_u16 v3, v3, v6
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 %cast = bitcast <8 x i16> %result to <4 x float>
@@ -6527,7 +4284,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT: s_cselect_b32 s18, s0, 0
 ; GFX6-NEXT: s_sub_i32 s18, s16, s18
 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0
-; GFX6-NEXT: s_mov_b32 s17, 0x80000000
+; GFX6-NEXT: s_brev_b32 s17, 1
 ; GFX6-NEXT: s_cselect_b32 s19, s0, 0
 ; GFX6-NEXT: s_sub_i32 s19, s17, s19
 ; GFX6-NEXT: s_cmp_gt_i32 s19, s8
@@ -6670,7 +4427,7 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX8-NEXT: s_cselect_b32 s20, s18, s19
 ; GFX8-NEXT: s_sub_i32 s20, s16, s20
 ; GFX8-NEXT: s_cmp_lt_i32 s18, s19
-; GFX8-NEXT: s_mov_b32 s17, 0x8000
+; GFX8-NEXT: s_movk_i32 s17, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s18, s18, s19
 ; GFX8-NEXT: s_sub_i32 s18, s17, s18
 ; GFX8-NEXT: s_sext_i32_i16 s18, s18
@@ -6814,373 +4571,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ;
 ; GFX9-LABEL: s_saddsat_v8i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s10, 0, 0
-; GFX9-NEXT: s_sext_i32_i16 s13, s10
-; GFX9-NEXT: s_sext_i32_i16 s11, s0
-; GFX9-NEXT: s_ashr_i32 s12, s0, 16
-; GFX9-NEXT: s_ashr_i32 s10, s10, 16
-; GFX9-NEXT: s_cmp_gt_i32 s11, s13
-; GFX9-NEXT: s_cselect_b32 s14, s11, s13
-; GFX9-NEXT: s_cmp_gt_i32 s12, s10
-; GFX9-NEXT: s_movk_i32 s8, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s15, s12, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15
-; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s8
-; GFX9-NEXT: s_lshr_b32 s16, s14, 16
-; GFX9-NEXT: s_lshr_b32 s15, s8, 16
-; GFX9-NEXT: s_sub_i32 s14, s8, s14
-; GFX9-NEXT: s_sub_i32 s16, s15, s16
-; GFX9-NEXT: s_cmp_lt_i32 s11, s13
-; GFX9-NEXT: s_cselect_b32 s11, s11, s13
-; GFX9-NEXT: s_cmp_lt_i32 s12, s10
-; GFX9-NEXT: s_mov_b32 s9, 0xffff8000
-; GFX9-NEXT: s_cselect_b32 s12, s12, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT: s_lshr_b32 s16, s11, 16
-; GFX9-NEXT: s_lshr_b32 s12, s9, 16
-; GFX9-NEXT: s_sub_i32 s11, s9, s11
-; GFX9-NEXT: s_sub_i32 s16, s12, s16
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s16
-; GFX9-NEXT: s_sext_i32_i16 s16, s11
-; GFX9-NEXT: s_sext_i32_i16 s17, s4
-; GFX9-NEXT: s_ashr_i32 s11, s11, 16
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_cmp_gt_i32 s16, s17
-; GFX9-NEXT: s_cselect_b32 s16, s16, s17
-; GFX9-NEXT: s_cmp_gt_i32 s11, s4
-; GFX9-NEXT: s_cselect_b32 s4, s11, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s4
-; GFX9-NEXT: s_sext_i32_i16 s11, s4
-; GFX9-NEXT: s_sext_i32_i16 s16, s14
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s14, s14, 16
-; GFX9-NEXT: s_cmp_lt_i32 s11, s16
-; GFX9-NEXT: s_cselect_b32 s11, s11, s16
-; GFX9-NEXT: s_cmp_lt_i32 s4, s14
-; GFX9-NEXT: s_cselect_b32 s4, s4, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT: s_lshr_b32 s11, s0, 16
-; GFX9-NEXT: s_lshr_b32 s14, s4, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s4
-; GFX9-NEXT: s_add_i32 s11, s11, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s11
-; GFX9-NEXT: s_sext_i32_i16 s4, s1
-; GFX9-NEXT: s_ashr_i32 s11, s1, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s14, s4, s13
-; GFX9-NEXT: s_cmp_gt_i32 s11, s10
-; GFX9-NEXT: s_cselect_b32 s16, s11, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT: s_lshr_b32 s16, s14, 16
-; GFX9-NEXT: s_sub_i32 s14, s8, s14
-; GFX9-NEXT: s_sub_i32 s16, s15, s16
-; GFX9-NEXT: s_cmp_lt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s4, s4, s13
-; GFX9-NEXT: s_cmp_lt_i32 s11, s10
-; GFX9-NEXT: s_cselect_b32 s11, s11, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT: s_lshr_b32 s11, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s11, s12, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16
-; GFX9-NEXT: s_sext_i32_i16 s11, s4
-; GFX9-NEXT: s_sext_i32_i16 s16, s5
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s5, s5, 16
-; GFX9-NEXT: s_cmp_gt_i32 s11, s16
-; GFX9-NEXT: s_cselect_b32 s11, s11, s16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s11, s14
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s14, s14, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s11
-; GFX9-NEXT: s_cselect_b32 s5, s5, s11
-; GFX9-NEXT: s_cmp_lt_i32 s4, s14
-; GFX9-NEXT: s_cselect_b32 s4, s4, s14
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s1, 16
-; GFX9-NEXT: s_lshr_b32 s11, s4, 16
-; GFX9-NEXT: s_add_i32 s1, s1, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-NEXT: s_sext_i32_i16 s4, s2
-; GFX9-NEXT: s_ashr_i32 s5, s2, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s11, s4, s13
-; GFX9-NEXT: s_cmp_gt_i32 s5, s10
-; GFX9-NEXT: s_cselect_b32 s14, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s14
-; GFX9-NEXT: s_lshr_b32 s14, s11, 16
-; GFX9-NEXT: s_sub_i32 s11, s8, s11
-; GFX9-NEXT: s_sub_i32 s14, s15, s14
-; GFX9-NEXT: s_cmp_lt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s4, s4, s13
-; GFX9-NEXT: s_cmp_lt_i32 s5, s10
-; GFX9-NEXT: s_cselect_b32 s5, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s5, s12, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s14
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s14, s6
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s14
-; GFX9-NEXT: s_cselect_b32 s5, s5, s14
-; GFX9-NEXT: s_cmp_gt_i32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s6, s11
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s11, s11, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_cmp_lt_i32 s4, s11
-; GFX9-NEXT: s_cselect_b32 s4, s4, s11
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_add_i32 s2, s2, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT: s_sext_i32_i16 s4, s3
-; GFX9-NEXT: s_ashr_i32 s5, s3, 16
-; GFX9-NEXT: s_cmp_gt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s6, s4, s13
-; GFX9-NEXT: s_cmp_gt_i32 s5, s10
-; GFX9-NEXT: s_cselect_b32 s11, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11
-; GFX9-NEXT: s_lshr_b32 s11, s6, 16
-; GFX9-NEXT: s_sub_i32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s15, s11
-; GFX9-NEXT: s_cmp_lt_i32 s4, s13
-; GFX9-NEXT: s_cselect_b32 s4, s4, s13
-; GFX9-NEXT: s_cmp_lt_i32 s5, s10
-; GFX9-NEXT: s_cselect_b32 s5, s5, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
-; GFX9-NEXT: s_sub_i32 s4, s9, s4
-; GFX9-NEXT: s_sub_i32 s5, s12, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s8, s7
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s7, s7, 16
-; GFX9-NEXT: s_cmp_gt_i32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_cmp_gt_i32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
-; GFX9-NEXT: s_sext_i32_i16 s7, s6
-; GFX9-NEXT: s_ashr_i32 s4, s4, 16
-; GFX9-NEXT: s_ashr_i32 s6, s6, 16
-; GFX9-NEXT: s_cmp_lt_i32 s5, s7
-; GFX9-NEXT: s_cselect_b32 s5, s5, s7
-; GFX9-NEXT: s_cmp_lt_i32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_lshr_b32 s6, s4, 16
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp
+; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp
+; GFX9-NEXT: v_pk_add_i16 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_saddsat_v8i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s8, 0, 0
-; GFX10-NEXT: s_sext_i32_i16 s9, s0
-; GFX10-NEXT: s_sext_i32_i16 s11, s8
-; GFX10-NEXT: s_ashr_i32 s10, s0, 16
-; GFX10-NEXT: s_ashr_i32 s8, s8, 16
-; GFX10-NEXT: s_cmp_gt_i32 s9, s11
-; GFX10-NEXT: s_movk_i32 s13, 0x7fff
-; GFX10-NEXT: s_cselect_b32 s12, s9, s11
-; GFX10-NEXT: s_cmp_gt_i32 s10, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s13
-; GFX10-NEXT: s_cselect_b32 s14, s10, s8
-; GFX10-NEXT: s_mov_b32 s16, 0xffff8000
-; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14
-; GFX10-NEXT: s_lshr_b32 s14, s13, 16
-; GFX10-NEXT: s_lshr_b32 s15, s12, 16
-; GFX10-NEXT: s_sub_i32 s12, s13, s12
-; GFX10-NEXT: s_sub_i32 s15, s14, s15
-; GFX10-NEXT: s_cmp_lt_i32 s9, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16
-; GFX10-NEXT: s_cselect_b32 s9, s9, s11
-; GFX10-NEXT: s_cmp_lt_i32 s10, s8
-; GFX10-NEXT: s_sext_i32_i16 s18, s4
-; GFX10-NEXT: s_cselect_b32 s10, s10, s8
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10
-; GFX10-NEXT: s_lshr_b32 s10, s16, 16
-; GFX10-NEXT: s_lshr_b32 s17, s9, 16
-; GFX10-NEXT: s_sub_i32 s9, s16, s9
-; GFX10-NEXT: s_sub_i32 s17, s10, s17
+; GFX10-NEXT: v_pk_add_i16 v0, s0, s4 clamp
+; GFX10-NEXT: v_pk_add_i16 v1, s1, s5 clamp
+; GFX10-NEXT: v_pk_add_i16 v2, s2, s6 clamp
+; GFX10-NEXT: v_pk_add_i16 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s17
-; GFX10-NEXT: s_sext_i32_i16 s17, s9
-; GFX10-NEXT: s_ashr_i32 s9, s9, 16
-; GFX10-NEXT: s_cmp_gt_i32 s17, s18
-; GFX10-NEXT: s_cselect_b32 s17, s17, s18
-; GFX10-NEXT: s_cmp_gt_i32 s9, s4
-; GFX10-NEXT: s_sext_i32_i16 s18, s5
-; GFX10-NEXT: s_cselect_b32 s4, s9, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s9, s12, s15
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s17, s4
-; GFX10-NEXT: s_sext_i32_i16 s15, s9
-; GFX10-NEXT: s_sext_i32_i16 s12, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_ashr_i32 s9, s9, 16
-; GFX10-NEXT: s_cmp_lt_i32 s12, s15
-; GFX10-NEXT: s_cselect_b32 s12, s12, s15
-; GFX10-NEXT: s_cmp_lt_i32 s4, s9
-; GFX10-NEXT: s_cselect_b32 s4, s4, s9
-; GFX10-NEXT: s_lshr_b32 s9, s0, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT: s_lshr_b32 s12, s4, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s1
-; GFX10-NEXT: s_add_i32 s9, s9, s12
-; GFX10-NEXT: s_ashr_i32 s12, s1, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-NEXT: s_cselect_b32 s15, s4, s11
-; GFX10-NEXT: s_cmp_gt_i32 s12, s8
-; GFX10-NEXT: s_cselect_b32 s17, s12, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT: s_lshr_b32 s17, s15, 16
-; GFX10-NEXT: s_sub_i32 s15, s13, s15
-; GFX10-NEXT: s_sub_i32 s17, s14, s17
-; GFX10-NEXT: s_cmp_lt_i32 s4, s11
-; GFX10-NEXT: s_cselect_b32 s4, s4, s11
-; GFX10-NEXT: s_cmp_lt_i32 s12, s8
-; GFX10-NEXT: s_cselect_b32 s12, s12, s8
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT: s_lshr_b32 s12, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s16, s4
-; GFX10-NEXT: s_sub_i32 s12, s10, s12
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT: s_sext_i32_i16 s12, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_gt_i32 s12, s18
-; GFX10-NEXT: s_cselect_b32 s12, s12, s18
-; GFX10-NEXT: s_cmp_gt_i32 s4, s5
-; GFX10-NEXT: s_sext_i32_i16 s18, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_pack_ll_b32_b16 s5, s15, s17
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT: s_sext_i32_i16 s15, s5
-; GFX10-NEXT: s_sext_i32_i16 s12, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_ashr_i32 s5, s5, 16
-; GFX10-NEXT: s_cmp_lt_i32 s12, s15
-; GFX10-NEXT: s_cselect_b32 s12, s12, s15
-; GFX10-NEXT: s_cmp_lt_i32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_lshr_b32 s5, s1, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT: s_lshr_b32 s12, s4, 16
-; GFX10-NEXT: s_add_i32 s1, s1, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s2
-; GFX10-NEXT: s_add_i32 s5, s5, s12
-; GFX10-NEXT: s_ashr_i32 s12, s2, 16
-; GFX10-NEXT: s_cmp_gt_i32 s4, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_cselect_b32 s15, s4, s11
-; GFX10-NEXT: s_cmp_gt_i32 s12, s8
-; GFX10-NEXT: s_cselect_b32 s17, s12, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT: s_lshr_b32 s17, s15, 16
-; GFX10-NEXT: s_sub_i32 s15, s13, s15
-; GFX10-NEXT: s_sub_i32 s17, s14, s17
-; GFX10-NEXT: s_cmp_lt_i32 s4, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17
-; GFX10-NEXT: s_cselect_b32 s4, s4, s11
-; GFX10-NEXT: s_cmp_lt_i32 s12, s8
-; GFX10-NEXT: s_cselect_b32 s12, s12, s8
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT: s_lshr_b32 s12, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s16, s4
-; GFX10-NEXT: s_sub_i32 s12, s10, s12
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12
-; GFX10-NEXT: s_sext_i32_i16 s12, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_gt_i32 s12, s18
-; GFX10-NEXT: s_cselect_b32 s12, s12, s18
-; GFX10-NEXT: s_cmp_gt_i32 s4, s6
-; GFX10-NEXT: s_cselect_b32 s4, s4, s6
-; GFX10-NEXT: s_sext_i32_i16 s6, s15
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4
-; GFX10-NEXT: s_ashr_i32 s12, s15, 16
-; GFX10-NEXT: s_sext_i32_i16 s15, s4
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_cmp_lt_i32 s15, s6
-; GFX10-NEXT: s_cselect_b32 s6, s15, s6
-; GFX10-NEXT: s_cmp_lt_i32 s4, s12
-; GFX10-NEXT: s_sext_i32_i16 s15, s3
-; GFX10-NEXT: s_cselect_b32 s4, s4, s12
-; GFX10-NEXT: s_lshr_b32 s12, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s4
-; GFX10-NEXT: s_ashr_i32 s6, s3, 16
-; GFX10-NEXT: s_lshr_b32 s17, s4, 16
-; GFX10-NEXT: s_add_i32 s2, s2, s4
-; GFX10-NEXT: s_add_i32 s12, s12, s17
-; GFX10-NEXT: s_cmp_gt_i32 s15, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s12
-; GFX10-NEXT: s_cselect_b32 s4, s15, s11
-; GFX10-NEXT: s_cmp_gt_i32 s6, s8
-; GFX10-NEXT: s_cselect_b32 s17, s6, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s17
-; GFX10-NEXT: s_lshr_b32 s17, s4, 16
-; GFX10-NEXT: s_sub_i32 s4, s13, s4
-; GFX10-NEXT: s_sub_i32 s13, s14, s17
-; GFX10-NEXT: s_cmp_lt_i32 s15, s11
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s13
-; GFX10-NEXT: s_cselect_b32 s11, s15, s11
-; GFX10-NEXT: s_cmp_lt_i32 s6, s8
-; GFX10-NEXT: s_cselect_b32 s6, s6, s8
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s11, s6
-; GFX10-NEXT: s_lshr_b32 s8, s6, 16
-; GFX10-NEXT: s_sub_i32 s6, s16, s6
-; GFX10-NEXT: s_sub_i32 s8, s10, s8
-; GFX10-NEXT: s_sext_i32_i16 s10, s7
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8
-; GFX10-NEXT: s_ashr_i32 s7, s7, 16
-; GFX10-NEXT: s_sext_i32_i16 s8, s6
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_gt_i32 s8, s10
-; GFX10-NEXT: s_cselect_b32 s8, s8, s10
-; GFX10-NEXT: s_cmp_gt_i32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s6, s6, s7
-; GFX10-NEXT: s_sext_i32_i16 s7, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s6, s8, s6
-; GFX10-NEXT: s_ashr_i32 s4, s4, 16
-; GFX10-NEXT: s_sext_i32_i16 s8, s6
-; GFX10-NEXT: s_ashr_i32 s6, s6, 16
-; GFX10-NEXT: s_cmp_lt_i32 s8, s7
-; GFX10-NEXT: s_cselect_b32 s7, s8, s7
-; GFX10-NEXT: s_cmp_lt_i32 s6, s4
-; GFX10-NEXT: s_cselect_b32 s4, s6, s4
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4
-; GFX10-NEXT: s_lshr_b32 s6, s4, 16
-; GFX10-NEXT: s_add_i32 s3, s3, s4
-; GFX10-NEXT: s_add_i32 s5, s5, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 %cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index f68465faf61cd..57737aeb886fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -399,7 +399,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT: s_movk_i32 s4, 0x1000
 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT: v_mov_b32_e32 v3, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0xfffff000
+; CGP-NEXT: s_movk_i32 s5, 0xf000
 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 1813c33019ae8..f6565fe1b6e24 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -200,13 +200,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-LABEL: s_sdiv_i64:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[0:1], 0
+; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, -1
+; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0
 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1
-; CHECK-NEXT: s_mov_b32 s0, 1
 ; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9]
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: s_cbranch_vccz BB1_2
@@ -354,13 +353,13 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
-; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
 ; CHECK-NEXT: BB1_2: ; %Flow
-; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s1, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT: s_cbranch_scc0 BB1_4
 ; CHECK-NEXT: ; %bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index ed1fe7af5f365..015f6b5de8b04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -529,8 +529,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX9-LABEL: v_shl_v2i16_15:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, s4, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 %result = shl <2 x i16> %value, <i16 15, i16 15>
 ret <2 x i16> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 43f79f4b207d0..320d814be8a94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -369,7 +369,7 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT: s_movk_i32 s4, 0x1000
 ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT: v_mov_b32_e32 v3, 0x1000
-; CGP-NEXT: s_mov_b32 s5, 0xfffff000
+; CGP-NEXT: s_movk_i32 s5, 0xf000
 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 438388ebf7136..06d46321a59b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -196,13 +196,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-LABEL: s_srem_i64:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[0:1], 0
+; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5]
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, -1
+; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0
 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1
-; CHECK-NEXT: s_mov_b32 s0, 1
 ; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9]
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: s_cbranch_vccz BB1_2
@@ -352,9 +351,9 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
 ; CHECK-NEXT: BB1_2: ; %Flow
-; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s1, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT: s_cbranch_scc0 BB1_4
 ; CHECK-NEXT: ; %bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index a7154131e3c07..f9e4ccd03955e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -24,9 +24,8 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_max_i16_e32 v2, s4, v0
-; GFX8-NEXT: v_min_i16_e32 v3, s4, v0
+; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
+; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1
 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
@@ -40,15 +39,8 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_max_i16_e32 v2, s4, v0
-; GFX9-NEXT: v_min_i16_e32 v3, s4, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1
-; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_max_i16_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -57,16 +49,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_max_i16_e64 v2, v0, s4
-; GFX10-NEXT: v_min_i16_e64 v3, v0, s4
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
@@ -98,13 +83,13 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT: s_sext_i32_i16 s3, s0
-; GFX8-NEXT: s_sext_i32_i16 s4, 0xffff
+; GFX8-NEXT: s_sext_i32_i16 s4, -1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s5, s3, s4
 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
 ; GFX8-NEXT: s_sext_i32_i16 s4, s5
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s4, s1
@@ -121,54 +106,23 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i7:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, 0xffff
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000
-; GFX9-NEXT: s_sext_i32_i16 s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s1, s4, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i7:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s5
-; GFX10-NEXT: s_sub_i32 s3, s3, 0x8000
-; GFX10-NEXT: s_cmp_gt_i32 s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
 ret i7 %result
@@ -194,9 +148,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_max_i16_e32 v2, s4, v0
-; GFX8-NEXT: v_min_i16_e32 v3, s4, v0
+; GFX8-NEXT: v_max_i16_e32 v2, -1, v0
+; GFX8-NEXT: v_min_i16_e32 v3, -1, v0
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
@@ -210,15 +163,8 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: v_max_i16_e32 v2, s4, v0
-; GFX9-NEXT: v_min_i16_e32 v3, s4, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2
-; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3
-; GFX9-NEXT: v_max_i16_e32 v1, v2, v1
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -227,16 +173,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xffff
 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_max_i16_e64 v2, v0, s4
-; GFX10-NEXT: v_min_i16_e64 v3, v0, s4
-; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff
-; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000
-; GFX10-NEXT: v_max_i16_e64 v1, v2, v1
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v3
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
@@ -268,13 +207,13 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2
 ; GFX8-NEXT: s_sext_i32_i16 s3, s0
-; GFX8-NEXT: s_sext_i32_i16 s4, 0xffff
+; GFX8-NEXT: s_sext_i32_i16 s4, -1
 ; GFX8-NEXT: s_cmp_gt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s5, s3, s4
 ; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff
 ; GFX8-NEXT: s_cmp_lt_i32 s3, s4
 ; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_sub_i32 s3, s3, 0x8000
+; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
 ; GFX8-NEXT: s_sext_i32_i16 s4, s5
 ; GFX8-NEXT: s_sext_i32_i16 s1, s1
 ; GFX8-NEXT: s_cmp_gt_i32 s4, s1
@@ -291,54 +230,23 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2
-; GFX9-NEXT: s_sext_i32_i16 s3, s0
-; GFX9-NEXT: s_sext_i32_i16 s4, 0xffff
-; GFX9-NEXT: s_cmp_gt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s5, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX9-NEXT: s_cmp_lt_i32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000
-; GFX9-NEXT: s_sext_i32_i16 s4, s5
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s1, s4, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s3, s3
-; GFX9-NEXT: s_cmp_lt_i32 s1, s3
-; GFX9-NEXT: s_cselect_b32 s1, s1, s3
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_ashr_i32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_ssubsat_i8:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
-; GFX10-NEXT: s_sext_i32_i16 s4, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2
-; GFX10-NEXT: s_sext_i32_i16 s3, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_gt_i32 s3, s4
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s3, s4
-; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff
-; GFX10-NEXT: s_cmp_lt_i32 s3, s4
-; GFX10-NEXT: s_cselect_b32 s3, s3, s4
-; GFX10-NEXT: s_sext_i32_i16 s4, s5
-; GFX10-NEXT: s_sub_i32 s3, s3, 0x8000
-; GFX10-NEXT: s_cmp_gt_i32 s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s3, s3
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_cmp_lt_i32 s1, s3
-; GFX10-NEXT: s_cselect_b32 s1, s1, s3
-; GFX10-NEXT: s_sub_i32 s0, s0, s1
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, s2
+; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
+; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
 ret i8 %result
@@ -355,7 +263,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0
 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
@@ -385,21 +293,20 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT: v_mov_b32_e32 v2, 8
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: s_mov_b32 s6, 0xffff
 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_max_i16_e32 v4, s6, v0
+; GFX8-NEXT: v_max_i16_e32 v4, -1, v0
 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4
-; GFX8-NEXT: s_mov_b32 s5, 0x8000
-; GFX8-NEXT: v_min_i16_e32 v5, s6, v0
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_min_i16_e32 v5, -1, v0
 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1
 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5
 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5
 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1
-; GFX8-NEXT: v_max_i16_e32 v1, s6, v3
+; GFX8-NEXT: v_max_i16_e32 v1, -1, v3
 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1
-; GFX8-NEXT: v_min_i16_e32 v4, s6, v3
+; GFX8-NEXT: v_min_i16_e32 v4, -1, v3
 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4
 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
@@ -415,27 +322,12 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s4, 8
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: s_mov_b32 s6, 0xffff
 ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_i16_e32 v4, s6, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_subrev_u16_e32 v4, s4, v4
-; GFX9-NEXT: s_mov_b32 s5, 0x8000
-; GFX9-NEXT: v_min_i16_e32 v5, s6, v0
-; GFX9-NEXT: v_max_i16_e32 v1, v4, v1
-; GFX9-NEXT: v_subrev_u16_e32 v5, s5, v5
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v5
-; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, s6, v2
-; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1
-; GFX9-NEXT: v_min_i16_e32 v4, s6, v2
-; GFX9-NEXT: v_subrev_u16_e32 v4, s5, v4
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v3
-; GFX9-NEXT: v_min_i16_e32 v1, v1, v4
+; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1
+; GFX9-NEXT: v_sub_i16 v1, v2, v3 clamp
 ; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -446,32 +338,17 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: s_mov_b32 s4, 8
-; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s6, 0xffff
-; GFX10-NEXT: s_movk_i32 s5, 0x7fff
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_max_i16_e64 v4, v2, s6
-; GFX10-NEXT: v_max_i16_e64 v5, v0, s6
-; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
-; GFX10-NEXT: v_min_i16_e64 v6, v2, s6
-; GFX10-NEXT: v_min_i16_e64 v7, v0, s6
-; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, s5
-; GFX10-NEXT: v_sub_nc_u16_e64 v5, v5, s5
-; GFX10-NEXT: s_mov_b32 s4, 0x8000
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_sub_nc_u16_e64 v6, v6, s4
-; GFX10-NEXT: v_max_i16_e64 v1, v4, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v7, v7, s4
-; GFX10-NEXT: v_max_i16_e64 v10, v5, v3
+; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
 ; GFX10-NEXT: s_movk_i32 s4, 0xff
-; GFX10-NEXT: v_min_i16_e64 v1, v1, v6
-; GFX10-NEXT: v_min_i16_e64 v3, v10, v7
-; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3
-; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_sub_nc_i16 v1, v2, v1 clamp
+; GFX10-NEXT: v_sub_nc_i16 v0, v0, v3 clamp
+; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %lhs = bitcast i16 %lhs.arg to <2 x i8>
 %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -492,7 +369,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX6-NEXT: s_cselect_b32 s6, s0, -1
 ; GFX6-NEXT: s_sub_i32 s6, s6, s4
 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x80000000
+; GFX6-NEXT: s_brev_b32 s5, 1
 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1
 ; GFX6-NEXT: s_sub_i32 s7, s7, s5
 ; GFX6-NEXT: s_cmp_gt_i32 s6, s1
@@ -530,13 +407,13 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT: s_sext_i32_i16 s7, s0
-; GFX8-NEXT: s_sext_i32_i16 s8, 0xffff
+; GFX8-NEXT: s_sext_i32_i16 s8, -1
 ; GFX8-NEXT: s_cmp_gt_i32 s7, s8
 ; GFX8-NEXT: s_movk_i32 s5, 0x7fff
 ; GFX8-NEXT: s_cselect_b32 s9, s7, s8
 ; GFX8-NEXT: s_sub_i32 s9, s9, s5
 ; GFX8-NEXT: s_cmp_lt_i32 s7, s8
-; GFX8-NEXT: s_mov_b32 s6, 0x8000
+; GFX8-NEXT: s_movk_i32 s6, 0x8000
 ; GFX8-NEXT: s_cselect_b32 s7, s7, s8
 ; GFX8-NEXT: s_sub_i32 s7, s7, s6
 ; GFX8-NEXT: s_sext_i32_i16 s9, s9
@@ -580,112 +457,40 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-LABEL: s_ssubsat_v2i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000
-; GFX9-NEXT: s_lshr_b32 s2, s0, 8
-; GFX9-NEXT: s_lshl_b32 s0, s0, s4
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8
 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4
-; GFX9-NEXT: s_sext_i32_i16 s7, s0
-; GFX9-NEXT: s_sext_i32_i16 s8, 0xffff
-; GFX9-NEXT: s_cmp_gt_i32 s7, s8
-; GFX9-NEXT: s_movk_i32 s5, 0x7fff
-; GFX9-NEXT: s_cselect_b32 s9, s7, s8
-; GFX9-NEXT: s_sub_i32 s9, s9, s5
-; GFX9-NEXT: s_cmp_lt_i32 s7, s8
-; GFX9-NEXT: s_mov_b32 s6, 0x8000
-; GFX9-NEXT: s_cselect_b32 s7, s7, s8
-; GFX9-NEXT: s_sub_i32 s7, s7, s6
-; GFX9-NEXT: s_sext_i32_i16 s9, s9
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_cmp_gt_i32 s9, s1
-; GFX9-NEXT: s_cselect_b32 s1, s9, s1
-; GFX9-NEXT: s_sext_i32_i16 s1, s1
-; GFX9-NEXT: s_sext_i32_i16 s7, s7
-; GFX9-NEXT: s_cmp_lt_i32 s1, s7
-; GFX9-NEXT: s_cselect_b32 s1, s1, s7
-; GFX9-NEXT: s_sub_i32 s0, s0, s1
-; GFX9-NEXT: s_sext_i32_i16 s0, s0
-; GFX9-NEXT: s_lshl_b32 s1, s2, s4
-; GFX9-NEXT: s_lshl_b32 s2, s3, s4
-; GFX9-NEXT: s_ashr_i32 s0, s0, s4
-; GFX9-NEXT: s_sext_i32_i16 s3, s1
-; GFX9-NEXT: s_cmp_gt_i32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s7, s3, s8
-; GFX9-NEXT: s_sub_i32 s5, s7, s5
-; GFX9-NEXT: s_cmp_lt_i32 s3, s8
-;
GFX9-NEXT: s_cselect_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_gt_i32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_ashr_i32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: s_lshl_b32 s0, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_i16 v1, s0, v1 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s6, 0xffff -; GFX10-NEXT: s_sext_i32_i16 s5, s0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s6 -; GFX10-NEXT: s_mov_b32 s9, 0x8000 -; GFX10-NEXT: s_sub_i32 s8, s8, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s8, s8 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_sub_i32 s5, s5, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s1 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s1, s8, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp +; GFX10-NEXT: v_sub_nc_i16 v1, s2, s3 clamp +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s5 -; GFX10-NEXT: s_cselect_b32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s4, s2 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s3 -; GFX10-NEXT: s_ashr_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cselect_b32 s5, s4, s6 -; GFX10-NEXT: s_sub_i32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s4, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 -; GFX10-NEXT: s_sub_i32 s4, s4, s9 -; GFX10-NEXT: s_cmp_gt_i32 s5, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: s_cselect_b32 s1, s5, s1 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s4 -; GFX10-NEXT: s_cselect_b32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_and_b32 s0, s0, s3 -; GFX10-NEXT: s_ashr_i32 s1, s1, s2 -; GFX10-NEXT: 
s_and_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -709,7 +514,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 @@ -736,7 +541,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: s_movk_i32 s4, 0xff @@ -769,43 +574,41 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v9, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v11, s6, v0 -; GFX8-NEXT: v_max_i16_e32 v1, v9, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 +; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v3 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX8-NEXT: v_min_i16_e32 v9, s6, v3 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v4, v2, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_min_i16_e32 v6, v2, v8 -; GFX8-NEXT: v_sub_u16_e32 v4, v4, v10 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 -; GFX8-NEXT: v_max_i16_e32 v5, v3, v8 -; GFX8-NEXT: 
v_min_i16_e32 v6, v3, v8 +; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_sub_u16_e32 v5, v5, v10 +; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 @@ -826,54 +629,25 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v9, s6, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX9-NEXT: s_mov_b32 s5, 0x8000 -; GFX9-NEXT: v_min_i16_e32 v11, s6, v0 -; GFX9-NEXT: v_max_i16_e32 v1, v9, v1 -; GFX9-NEXT: v_subrev_u16_e32 v11, s5, v11 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v11 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v1, s6, v2 -; GFX9-NEXT: v_subrev_u16_e32 v1, s4, v1 -; GFX9-NEXT: v_min_i16_e32 v9, s6, v2 -; GFX9-NEXT: v_subrev_u16_e32 v9, s5, v9 -; GFX9-NEXT: v_max_i16_e32 v1, v1, v5 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v9 -; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp +; GFX9-NEXT: v_sub_i16 v1, v2, v5 clamp ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX9-NEXT: v_max_i16_e32 v5, v2, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_min_i16_e32 v6, v2, v8 -; GFX9-NEXT: v_sub_u16_e32 v5, v5, v10 -; GFX9-NEXT: v_subrev_u16_e32 v6, s5, v6 -; GFX9-NEXT: v_max_i16_e32 v3, v5, v3 -; GFX9-NEXT: v_min_i16_e32 v3, v3, v6 -; GFX9-NEXT: v_sub_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_max_i16_e32 v5, v3, v8 -; GFX9-NEXT: v_min_i16_e32 v6, v3, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX9-NEXT: v_sub_u16_e32 v5, v5, v10 ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 -; GFX9-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_min_i16_e32 v4, v4, v6 +; GFX9-NEXT: v_sub_i16 v2, v2, v3 clamp +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_i16 v3, v3, v4 clamp ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -885,58 +659,29 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; 
GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_mov_b32 s6, 24 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v19, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_max_i16_e64 v8, v0, s4 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_max_i16_e64 v9, v2, s4 -; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_min_i16_e64 v10, v0, s4 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v8, s5 -; GFX10-NEXT: s_mov_b32 s6, 0x8000 -; GFX10-NEXT: v_sub_nc_u16_e64 v15, v9, s5 -; GFX10-NEXT: v_min_i16_e64 v11, v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xffff -; GFX10-NEXT: v_max_i16_e64 v1, v8, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v10, v10, s6 -; GFX10-NEXT: v_max_i16_e64 v5, v15, v5 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v11, s6 -; GFX10-NEXT: v_mov_b32_e32 v9, 0x7fff -; GFX10-NEXT: v_max_i16_e64 v11, v3, v12 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v10 -; GFX10-NEXT: v_max_i16_e64 v10, v19, v12 -; GFX10-NEXT: v_min_i16_e64 v5, v5, v8 -; GFX10-NEXT: v_min_i16_e64 v8, v3, v12 -; GFX10-NEXT: v_sub_nc_u16_e64 v11, v11, v9 -; GFX10-NEXT: v_min_i16_e64 v12, v19, v12 -; GFX10-NEXT: v_sub_nc_u16_e64 v9, v10, v9 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v5 -; GFX10-NEXT: v_sub_nc_u16_e64 v5, v8, s6 -; GFX10-NEXT: v_max_i16_e64 v6, v11, v6 -; GFX10-NEXT: v_sub_nc_u16_e64 v8, v12, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v7, v9, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_min_i16_e64 v2, v6, v5 -; GFX10-NEXT: v_min_i16_e64 v5, v7, v8 +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1 +; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: s_mov_b32 s4, 24 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_i16 v2, v2, v3 clamp +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v2), s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_i16 v5, v5, v6 clamp +; GFX10-NEXT: v_sub_nc_i16 v3, v4, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v19, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX10-NEXT: v_ashrrev_i16_e64 v4, 8, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v3), s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, sext(v0), s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2 +; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -961,7 +706,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s1 @@ -1037,13 +782,13 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 ; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s12, -1 ; GFX8-NEXT: s_cmp_gt_i32 s11, s12 ; GFX8-NEXT: s_movk_i32 s9, 0x7fff ; GFX8-NEXT: s_cselect_b32 s13, s11, s12 ; GFX8-NEXT: s_sub_i32 s13, s13, s9 ; GFX8-NEXT: s_cmp_lt_i32 s11, s12 -; GFX8-NEXT: s_mov_b32 s10, 0x8000 +; GFX8-NEXT: s_movk_i32 s10, 0x8000 ; GFX8-NEXT: s_cselect_b32 s11, s11, s12 ; GFX8-NEXT: s_sub_i32 s11, s11, s10 ; GFX8-NEXT: s_sext_i32_i16 s13, s13 @@ -1133,212 +878,70 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-LABEL: s_ssubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_sext_i32_i16 s12, 0xffff -; GFX9-NEXT: s_cmp_gt_i32 s11, s12 -; GFX9-NEXT: s_movk_i32 s9, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s11, s12 -; GFX9-NEXT: s_sub_i32 s13, s13, s9 -; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_mov_b32 s10, 0x8000 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 -; GFX9-NEXT: s_sub_i32 s11, s11, s10 -; GFX9-NEXT: s_sext_i32_i16 s13, s13 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s13, s1 -; GFX9-NEXT: s_cselect_b32 s1, s13, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s11, s11 -; GFX9-NEXT: s_cmp_lt_i32 s1, s11 -; GFX9-NEXT: s_cselect_b32 s1, s1, s11 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_ashr_i32 s0, s0, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s11, s5, s12 -; GFX9-NEXT: s_sub_i32 s11, s11, s9 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_sext_i32_i16 s11, s11 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_gt_i32 s11, s2 -; GFX9-NEXT: s_cselect_b32 s2, 
s11, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_cmp_lt_i32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_ashr_i32 s1, s1, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s6, s5, s12 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_sext_i32_i16 s6, s6 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_cmp_gt_i32 s6, s3 -; GFX9-NEXT: s_cselect_b32 s3, s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_ashr_i32 s2, s2, s8 -; GFX9-NEXT: s_sext_i32_i16 s5, s3 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s6, s5, s12 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_sext_i32_i16 s6, s6 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s5 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_sub_i16 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_sub_i16 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_sub_i16 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v1), s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, sext(v2), s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, sext(v3), s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; 
GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_sub_nc_i16 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_sext_i32_i16 s10, 0xffff -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s6 -; GFX10-NEXT: s_cmp_gt_i32 s9, s10 -; GFX10-NEXT: s_movk_i32 s11, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s10 -; GFX10-NEXT: s_mov_b32 s13, 0x8000 -; GFX10-NEXT: s_sub_i32 s12, s12, s11 -; GFX10-NEXT: s_cmp_lt_i32 s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s12, s12 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_sub_i32 s9, s9, s13 -; GFX10-NEXT: s_cmp_gt_i32 s12, s1 -; GFX10-NEXT: s_sext_i32_i16 s9, s9 -; GFX10-NEXT: s_cselect_b32 s1, s12, s1 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v1), s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_i16 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_sub_nc_i16 v2, s3, s0 clamp +; GFX10-NEXT: v_sub_nc_i16 v3, s4, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s9 -; GFX10-NEXT: s_cselect_b32 s1, s1, s9 -; GFX10-NEXT: s_lshl_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s6 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, s6 -; GFX10-NEXT: s_cmp_gt_i32 s2, s10 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s9, s2, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s2, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s9 -; GFX10-NEXT: s_cselect_b32 s2, s2, s10 -; GFX10-NEXT: s_sub_i32 s2, s2, s13 -; GFX10-NEXT: s_cmp_gt_i32 s9, s5 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cmp_lt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_sub_i32 s1, s1, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_lshl_b32 s2, s7, s6 -; GFX10-NEXT: s_ashr_i32 s1, s1, s6 -; GFX10-NEXT: s_cmp_gt_i32 s5, s10 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s7, s5, s10 -; GFX10-NEXT: s_sub_i32 s7, s7, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s10 -; GFX10-NEXT: s_sext_i32_i16 s7, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s10 -; GFX10-NEXT: s_sub_i32 s5, s5, s13 -; GFX10-NEXT: s_cmp_gt_i32 s7, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s2, s7, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s6 -; GFX10-NEXT: s_sub_i32 s2, s3, s2 -; GFX10-NEXT: s_sext_i32_i16 s5, s4 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_lshl_b32 s3, s8, s6 -; GFX10-NEXT: s_ashr_i32 s2, s2, s6 -; GFX10-NEXT: s_cmp_gt_i32 s5, s10 -; 
GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_cselect_b32 s7, s5, s10 -; GFX10-NEXT: s_sub_i32 s7, s7, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s10 -; GFX10-NEXT: s_sext_i32_i16 s7, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s10 -; GFX10-NEXT: s_sub_i32 s5, s5, s13 -; GFX10-NEXT: s_cmp_gt_i32 s7, s3 -; GFX10-NEXT: s_sext_i32_i16 s5, s5 -; GFX10-NEXT: s_cselect_b32 s3, s7, s3 -; GFX10-NEXT: s_movk_i32 s7, 0xff -; GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_and_b32 s1, s1, s7 -; GFX10-NEXT: s_sub_i32 s3, s4, s3 -; GFX10-NEXT: s_and_b32 s0, s0, s7 -; GFX10-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s7 -; GFX10-NEXT: s_ashr_i32 s3, s3, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s7 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, sext(v2), s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, sext(v3), s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -1383,14 +986,8 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 0x7fffffff, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX9-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1401,13 +998,7 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX10-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x7fffffff, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 0x80000000, v3 -; GFX10-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs) @@ -1454,39 +1045,22 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX9-LABEL: s_ssubsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s2, s0, -1 -; GFX9-NEXT: s_sub_i32 s2, s2, 0x7fffffff -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s3, s0, -1 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x80000000 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 
clamp +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, -1 -; GFX10-NEXT: s_sub_i32 s2, s2, 0x7fffffff -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 -; GFX10-NEXT: s_cselect_b32 s3, s0, -1 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x80000000 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, 8 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s1 clamp +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -1520,27 +1094,15 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX9-LABEL: v_ssubsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, 0x7fffffff, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, 0x80000000, v3 -; GFX9-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX9-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX10-NEXT: v_min_i32_e32 v3, -1, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x7fffffff, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 0x80000000, v3 -; GFX10-NEXT: v_max_i32_e32 v1, v2, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1579,33 +1141,16 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX9-LABEL: s_ssubsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s2, s0, -1 -; GFX9-NEXT: s_sub_i32 s2, s2, 0x7fffffff -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s3, s0, -1 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x80000000 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, -1 -; GFX10-NEXT: s_sub_i32 s2, s2, 0x7fffffff -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 -; GFX10-NEXT: s_cselect_b32 s3, s0, -1 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x80000000 -; GFX10-NEXT: s_cmp_gt_i32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog 
%result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1640,29 +1185,13 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX9-LABEL: ssubsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s1, s0, -1 -; GFX9-NEXT: s_sub_i32 s1, s1, 0x7fffffff -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_cselect_b32 s2, s0, -1 -; GFX9-NEXT: s_sub_i32 s2, s2, 0x80000000 -; GFX9-NEXT: v_max_i32_e32 v0, s1, v0 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s1, s0, -1 -; GFX10-NEXT: s_sub_i32 s1, s1, 0x7fffffff -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 -; GFX10-NEXT: v_max_i32_e32 v0, s1, v0 -; GFX10-NEXT: s_cselect_b32 s1, s0, -1 -; GFX10-NEXT: s_sub_i32 s1, s1, 0x80000000 -; GFX10-NEXT: v_min_i32_e32 v0, s1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1694,25 +1223,13 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX9-LABEL: ssubsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, 0x80000000, v2 -; GFX9-NEXT: v_max_i32_e32 v1, s0, v1 -; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i32 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX10-NEXT: v_min_i32_e32 v2, -1, v0 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_max_i32_e32 v1, s0, v1 -; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1726,7 +1243,7 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 @@ -1747,7 +1264,7 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 @@ -1765,45 +1282,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 
v5, -1, v0 -; GFX9-NEXT: v_max_i32_e32 v2, v4, v2 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v5 -; GFX9-NEXT: v_min_i32_e32 v2, v2, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX9-NEXT: v_min_i32_e32 v2, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX10-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_min_i32_e32 v6, -1, v0 -; GFX10-NEXT: v_min_i32_e32 v7, -1, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s4, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s4, v5 -; GFX10-NEXT: s_mov_b32 s4, 0x80000000 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v6 -; GFX10-NEXT: v_max_i32_e32 v11, v4, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s4, v7 -; GFX10-NEXT: v_max_i32_e32 v10, v5, v3 -; GFX10-NEXT: v_min_i32_e32 v2, v11, v6 -; GFX10-NEXT: v_min_i32_e32 v3, v10, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1817,7 +1306,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s2 @@ -1845,7 +1334,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX8-NEXT: s_cselect_b32 s6, s0, -1 ; GFX8-NEXT: s_sub_i32 s6, s6, s4 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cselect_b32 s7, s0, -1 ; GFX8-NEXT: s_sub_i32 s7, s7, s5 ; GFX8-NEXT: s_cmp_gt_i32 s6, s2 @@ -1868,59 +1357,21 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX9-LABEL: s_ssubsat_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: s_cselect_b32 s6, s0, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s4 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s7, s0, -1 -; GFX9-NEXT: s_sub_i32 s7, s7, s5 -; GFX9-NEXT: s_cmp_gt_i32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lt_i32 s2, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s7 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s2, s1, -1 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s4, s1, -1 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_cmp_lt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: 
v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: s_cselect_b32 s5, s0, -1 -; GFX10-NEXT: s_mov_b32 s6, 0x80000000 -; GFX10-NEXT: s_sub_i32 s5, s5, s4 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s2 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s0, -1 -; GFX10-NEXT: s_sub_i32 s7, s7, s6 -; GFX10-NEXT: s_cmp_gt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_cmp_lt_i32 s2, s7 -; GFX10-NEXT: s_cselect_b32 s2, s2, s7 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s2, s1, -1 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s4, s1, -1 -; GFX10-NEXT: s_sub_i32 s4, s4, s6 -; GFX10-NEXT: s_cmp_gt_i32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_sub_i32 s1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1933,7 +1384,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v7, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7 @@ -1961,7 +1412,7 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v7, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7 @@ -1986,59 +1437,19 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v6 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX9-NEXT: v_max_i32_e32 v3, v6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 -; GFX9-NEXT: v_min_i32_e32 v3, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX9-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_min_i32_e32 v3, v3, v6 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_max_i32_e32 v3, v3, v5 -; GFX9-NEXT: v_min_i32_e32 v3, v3, v4 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_sub_i32 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v3i32: ; GFX10: ; %bb.0: 
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX10-NEXT: v_max_i32_e32 v8, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v9, -1, v2 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s4, v8 -; GFX10-NEXT: v_subrev_nc_u32_e32 v19, s4, v9 -; GFX10-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX10-NEXT: v_min_i32_e32 v11, -1, v2 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v14, v6, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s5, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s5, v10 -; GFX10-NEXT: v_max_i32_e32 v4, v15, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s5, v11 -; GFX10-NEXT: v_max_i32_e32 v5, v19, v5 -; GFX10-NEXT: v_min_i32_e32 v3, v14, v7 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v3 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v4 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX10-NEXT: v_min_i32_e32 v5, v5, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -2052,7 +1463,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX6-NEXT: s_cselect_b32 s8, s0, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s7, 0x80000000 +; GFX6-NEXT: s_brev_b32 s7, 1 ; GFX6-NEXT: s_cselect_b32 s9, s0, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s7 ; GFX6-NEXT: s_cmp_gt_i32 s8, s3 @@ -2091,7 +1502,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX8-NEXT: s_cselect_b32 s8, s0, -1 ; GFX8-NEXT: s_sub_i32 s8, s8, s6 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s7, 0x80000000 +; GFX8-NEXT: s_brev_b32 s7, 1 ; GFX8-NEXT: s_cselect_b32 s9, s0, -1 ; GFX8-NEXT: s_sub_i32 s9, s9, s7 ; GFX8-NEXT: s_cmp_gt_i32 s8, s3 @@ -2125,81 +1536,26 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX9-LABEL: s_ssubsat_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s6, -2 -; GFX9-NEXT: s_cselect_b32 s8, s0, -1 -; GFX9-NEXT: s_sub_i32 s8, s8, s6 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s7, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s9, s0, -1 -; GFX9-NEXT: s_sub_i32 s9, s9, s7 -; GFX9-NEXT: s_cmp_gt_i32 s8, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cmp_lt_i32 s3, s9 -; GFX9-NEXT: s_cselect_b32 s3, s3, s9 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s3, s1, -1 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s8, s1, -1 -; GFX9-NEXT: s_sub_i32 s8, s8, s7 -; GFX9-NEXT: s_cmp_gt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_i32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s3, s3, s8 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s3, s2, -1 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s4, s2, -1 -; GFX9-NEXT: s_sub_i32 s4, s4, s7 -; GFX9-NEXT: s_cmp_gt_i32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 
+; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s6, -2 -; GFX10-NEXT: s_cselect_b32 s7, s0, -1 -; GFX10-NEXT: s_mov_b32 s8, 0x80000000 -; GFX10-NEXT: s_sub_i32 s7, s7, s6 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s3 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s4 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s0, -1 -; GFX10-NEXT: s_sub_i32 s9, s9, s8 -; GFX10-NEXT: s_cmp_gt_i32 s7, s3 -; GFX10-NEXT: s_cselect_b32 s3, s7, s3 -; GFX10-NEXT: s_cmp_lt_i32 s3, s9 -; GFX10-NEXT: s_cselect_b32 s3, s3, s9 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s3, s1, -1 -; GFX10-NEXT: s_sub_i32 s3, s3, s6 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s7, s1, -1 -; GFX10-NEXT: s_sub_i32 s7, s7, s8 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_cmp_gt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s3, s2, -1 -; GFX10-NEXT: s_sub_i32 s3, s3, s6 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s4, s2, -1 -; GFX10-NEXT: s_sub_i32 s4, s4, s8 -; GFX10-NEXT: s_cmp_gt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -2212,7 +1568,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v9, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9 @@ -2247,7 +1603,7 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v9, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9 @@ -2279,73 +1635,21 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v8, s4, v8 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX9-NEXT: v_max_i32_e32 v4, v8, v4 -; GFX9-NEXT: v_subrev_u32_e32 v9, s5, v9 -; GFX9-NEXT: v_min_i32_e32 v4, v4, v9 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_max_i32_e32 
v4, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX9-NEXT: v_max_i32_e32 v4, v4, v5 -; GFX9-NEXT: v_subrev_u32_e32 v8, s5, v8 -; GFX9-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v5 -; GFX9-NEXT: v_max_i32_e32 v4, v4, v6 -; GFX9-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, 0x7fffffff, v4 -; GFX9-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, 0x80000000, v5 -; GFX9-NEXT: v_max_i32_e32 v4, v4, v7 -; GFX9-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_i32 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_max_i32_e32 v10, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v12, -1, v3 -; GFX10-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s4, v8 -; GFX10-NEXT: v_max_i32_e32 v8, -1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, 0x7fffffff, v12 -; GFX10-NEXT: v_min_i32_e32 v11, -1, v1 -; GFX10-NEXT: v_min_i32_e32 v13, -1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s4, v8 -; GFX10-NEXT: v_min_i32_e32 v14, -1, v3 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v4, v15, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s5, v9 -; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s5, v11 -; GFX10-NEXT: v_max_i32_e32 v15, v8, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s5, v13 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 0x80000000, v14 -; GFX10-NEXT: v_max_i32_e32 v7, v12, v7 -; GFX10-NEXT: v_min_i32_e32 v19, v4, v9 -; GFX10-NEXT: v_min_i32_e32 v11, v5, v11 -; GFX10-NEXT: v_min_i32_e32 v15, v15, v10 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v4 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v5 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v6 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_i32_e32 v6, v7, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -2359,7 +1663,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s4 @@ -2409,7 +1713,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX8-NEXT: s_cselect_b32 s10, s0, -1 ; GFX8-NEXT: s_sub_i32 s10, s10, s8 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s9, 0x80000000 +; GFX8-NEXT: s_brev_b32 s9, 1 ; GFX8-NEXT: s_cselect_b32 s11, s0, -1 ; GFX8-NEXT: s_sub_i32 
s11, s11, s9 ; GFX8-NEXT: s_cmp_gt_i32 s10, s4 @@ -2454,103 +1758,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX9-LABEL: s_ssubsat_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s8, -2 -; GFX9-NEXT: s_cselect_b32 s10, s0, -1 -; GFX9-NEXT: s_sub_i32 s10, s10, s8 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s9, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s11, s0, -1 -; GFX9-NEXT: s_sub_i32 s11, s11, s9 -; GFX9-NEXT: s_cmp_gt_i32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_cmp_lt_i32 s4, s11 -; GFX9-NEXT: s_cselect_b32 s4, s4, s11 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s4, s1, -1 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s10, s1, -1 -; GFX9-NEXT: s_sub_i32 s10, s10, s9 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_i32 s4, s10 -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s4, s2, -1 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s5, s2, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s9 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s4, s3, -1 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_cmp_lt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s5, s3, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s9 -; GFX9-NEXT: s_cmp_gt_i32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s8, -2 -; GFX10-NEXT: s_cselect_b32 s9, s0, -1 -; GFX10-NEXT: s_mov_b32 s10, 0x80000000 -; GFX10-NEXT: s_sub_i32 s9, s9, s8 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s4 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s5 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s6 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s11, s0, -1 -; GFX10-NEXT: s_sub_i32 s11, s11, s10 -; GFX10-NEXT: s_cmp_gt_i32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 -; GFX10-NEXT: s_cmp_lt_i32 s4, s11 -; GFX10-NEXT: s_cselect_b32 s4, s4, s11 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s4, s1, -1 -; GFX10-NEXT: s_sub_i32 s4, s4, s8 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s9, s1, -1 -; GFX10-NEXT: s_sub_i32 s9, s9, s10 -; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_sub_i32 s1, s1, 
s4 -; GFX10-NEXT: s_cmp_gt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s4, s2, -1 -; GFX10-NEXT: s_sub_i32 s4, s4, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s5, s2, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 -; GFX10-NEXT: s_cmp_lt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s4, s3, -1 -; GFX10-NEXT: s_sub_i32 s4, s4, s8 -; GFX10-NEXT: s_cmp_lt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s5, s3, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_gt_i32 s4, s7 -; GFX10-NEXT: s_cselect_b32 s4, s4, s7 -; GFX10-NEXT: s_cmp_lt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -2563,7 +1795,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 @@ -2586,7 +1818,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 -; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 @@ -2607,7 +1839,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12 @@ -2630,7 +1862,7 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 @@ -2648,90 +1880,22 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v10, s4, v10 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX9-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX9-NEXT: v_subrev_u32_e32 v12, s5, v12 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v10, s5, v10 -; GFX9-NEXT: v_min_i32_e32 v5, v5, 
v10 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v7 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_bfrev_b32_e32 v11, -2 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v13 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v8 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v13 -; GFX9-NEXT: v_max_i32_e32 v5, v5, v9 -; GFX9-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_sub_i32 v0, v0, v5 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v6 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v7 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v8 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX10-NEXT: v_max_i32_e32 v13, -1, v1 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_bfrev_b32_e32 v11, -2 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s4, v13 -; GFX10-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, 0x80000000 -; GFX10-NEXT: v_min_i32_e32 v15, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v5, v10, v5 -; GFX10-NEXT: v_max_i32_e32 v10, -1, v2 -; GFX10-NEXT: v_max_i32_e32 v6, v13, v6 -; GFX10-NEXT: v_max_i32_e32 v13, -1, v3 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v2 -; GFX10-NEXT: v_min_i32_e32 v23, -1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s4, v10 -; GFX10-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v17, v11 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v7, v10, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s5, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v15, s5, v15 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, s5, v16 -; GFX10-NEXT: v_max_i32_e32 v8, v13, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v23, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v19, v14 -; GFX10-NEXT: v_max_i32_e32 v11, v11, v9 -; GFX10-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX10-NEXT: v_min_i32_e32 v6, v6, v15 -; GFX10-NEXT: v_min_i32_e32 v7, v7, v16 -; GFX10-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX10-NEXT: v_min_i32_e32 v9, v11, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v5 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v6 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v7 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v8 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -2746,7 +1910,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX6-NEXT: s_cselect_b32 s12, s0, -1 ; GFX6-NEXT: s_sub_i32 s12, 
s12, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s11, 0x80000000 +; GFX6-NEXT: s_brev_b32 s11, 1 ; GFX6-NEXT: s_cselect_b32 s13, s0, -1 ; GFX6-NEXT: s_sub_i32 s13, s13, s11 ; GFX6-NEXT: s_cmp_gt_i32 s12, s5 @@ -2807,7 +1971,7 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX8-NEXT: s_cselect_b32 s12, s0, -1 ; GFX8-NEXT: s_sub_i32 s12, s12, s10 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s11, 0x80000000 +; GFX8-NEXT: s_brev_b32 s11, 1 ; GFX8-NEXT: s_cselect_b32 s13, s0, -1 ; GFX8-NEXT: s_sub_i32 s13, s13, s11 ; GFX8-NEXT: s_cmp_gt_i32 s12, s5 @@ -2863,125 +2027,36 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX9-LABEL: s_ssubsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s10, -2 -; GFX9-NEXT: s_cselect_b32 s12, s0, -1 -; GFX9-NEXT: s_sub_i32 s12, s12, s10 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s11, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s13, s0, -1 -; GFX9-NEXT: s_sub_i32 s13, s13, s11 -; GFX9-NEXT: s_cmp_gt_i32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 -; GFX9-NEXT: s_cmp_lt_i32 s5, s13 -; GFX9-NEXT: s_cselect_b32 s5, s5, s13 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s5, s1, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s12, s1, -1 -; GFX9-NEXT: s_sub_i32 s12, s12, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s5, s2, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s6, s2, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s5, s3, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s6, s3, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_cmp_gt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s5, s4, -1 -; GFX9-NEXT: s_sub_i32 s5, s5, s10 -; GFX9-NEXT: s_cmp_lt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s6, s4, -1 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_cmp_gt_i32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s10, -2 -; GFX10-NEXT: s_cselect_b32 s11, s0, -1 -; GFX10-NEXT: s_mov_b32 s12, 0x80000000 -; GFX10-NEXT: s_sub_i32 s11, s11, s10 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s6 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s8 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s13, s0, -1 -; GFX10-NEXT: s_sub_i32 s13, s13, s12 -; GFX10-NEXT: s_cmp_gt_i32 s11, s5 -; GFX10-NEXT: s_cselect_b32 s5, s11, s5 -; GFX10-NEXT: s_cmp_lt_i32 s5, s13 -; GFX10-NEXT: s_cselect_b32 s5, s5, s13 -; GFX10-NEXT: s_sub_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s5, s1, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s11, s1, -1 -; GFX10-NEXT: s_sub_i32 s11, s11, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_lt_i32 s5, s11 -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_gt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s5, s2, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s6, s2, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_gt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s5, s3, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s6, s3, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s8 -; GFX10-NEXT: s_cselect_b32 s5, s5, s8 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s5, s4, -1 -; GFX10-NEXT: s_sub_i32 s5, s5, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s6, s4, -1 -; GFX10-NEXT: s_sub_i32 s6, s6, s12 -; GFX10-NEXT: s_cmp_gt_i32 s5, s9 -; GFX10-NEXT: s_cselect_b32 s5, s5, s9 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -2995,7 +2070,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s4, v32 ; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v32, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s5, v32 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v32 @@ -3018,7 +2093,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 ; 
GFX6-NEXT: v_min_i32_e32 v17, v17, v19 @@ -3116,7 +2191,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s4, v32 ; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX8-NEXT: s_mov_b32 s5, 0x80000000 +; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v32, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s5, v32 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v32 @@ -3139,7 +2214,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_mov_b32_e32 v18, 0x80000000 +; GFX8-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 @@ -3233,244 +2308,44 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX9-LABEL: v_ssubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v32, s4, v32 -; GFX9-NEXT: v_max_i32_e32 v16, v32, v16 -; GFX9-NEXT: s_mov_b32 s5, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v32, -1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v32, s5, v32 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v32 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_max_i32_e32 v16, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_max_i32_e32 v16, -1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v16, s4, v16 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v2 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v18 -; GFX9-NEXT: v_subrev_u32_e32 v17, s5, v17 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_bfrev_b32_e32 v16, -2 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_mov_b32_e32 v18, 0x80000000 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v4 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v5 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v6 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v7 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v7, v7, v17 
-; GFX9-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v24 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v8, v8, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v25 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v9, v9, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v26 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v10, v10, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v27 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v11, v11, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v28 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v29 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v13, v13, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX9-NEXT: v_max_i32_e32 v17, v17, v30 -; GFX9-NEXT: v_sub_u32_e32 v19, v19, v18 -; GFX9-NEXT: v_min_i32_e32 v17, v17, v19 -; GFX9-NEXT: v_sub_u32_e32 v14, v14, v17 -; GFX9-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX9-NEXT: v_sub_u32_e32 v16, v17, v16 -; GFX9-NEXT: v_min_i32_e32 v17, -1, v15 -; GFX9-NEXT: v_sub_u32_e32 v17, v17, v18 -; GFX9-NEXT: v_max_i32_e32 v16, v16, v31 -; GFX9-NEXT: v_min_i32_e32 v16, v16, v17 -; GFX9-NEXT: v_sub_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_i32 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_i32 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_i32 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_i32 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_i32 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_i32 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_i32 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX10-NEXT: s_brev_b32 s4, -2 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v0 -; GFX10-NEXT: s_mov_b32 s5, 0x80000000 -; GFX10-NEXT: v_max_i32_e32 v36, -1, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v35, s4, v32 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v33, s5, v33 -; GFX10-NEXT: 
v_bfrev_b32_e32 v34, -2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v36, s4, v36 -; GFX10-NEXT: v_max_i32_e32 v16, v35, v16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v32, s4, v32 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v3 -; GFX10-NEXT: v_min_i32_e32 v37, -1, v1 -; GFX10-NEXT: v_max_i32_e32 v18, v36, v18 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v33 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v2 -; GFX10-NEXT: v_max_i32_e32 v38, v32, v17 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v36, v39, v34 -; GFX10-NEXT: v_mov_b32_e32 v35, 0x80000000 -; GFX10-NEXT: v_subrev_nc_u32_e32 v32, s5, v33 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v16 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v34 -; GFX10-NEXT: v_subrev_nc_u32_e32 v37, s5, v37 -; GFX10-NEXT: v_min_i32_e32 v16, v18, v32 -; GFX10-NEXT: v_max_i32_e32 v19, v36, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v33, v35 -; GFX10-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX10-NEXT: v_min_i32_e32 v39, v38, v37 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v16 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v4 -; GFX10-NEXT: v_min_i32_e32 v18, v19, v18 -; GFX10-NEXT: v_max_i32_e32 v19, -1, v5 -; GFX10-NEXT: v_max_i32_e32 v32, -1, v6 -; GFX10-NEXT: v_min_i32_e32 v33, -1, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v39 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v19, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v32, v32, v34 -; GFX10-NEXT: v_min_i32_e32 v36, -1, v6 -; GFX10-NEXT: v_min_i32_e32 v39, v17, v16 -; GFX10-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX10-NEXT: v_min_i32_e32 v16, -1, v7 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v33, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v39 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v36, v35 -; GFX10-NEXT: v_max_i32_e32 v22, v32, v22 -; GFX10-NEXT: v_min_i32_e32 v38, v19, v20 -; GFX10-NEXT: v_max_i32_e32 v20, -1, v9 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, v35 -; GFX10-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v21 -; GFX10-NEXT: v_min_i32_e32 v21, -1, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v20, v20, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v39, v34 -; GFX10-NEXT: v_max_i32_e32 v39, -1, v10 -; GFX10-NEXT: v_min_i32_e32 v16, v17, v16 -; GFX10-NEXT: v_min_i32_e32 v22, -1, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v38 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, v6, v19 -; GFX10-NEXT: v_max_i32_e32 v18, v18, v24 -; GFX10-NEXT: v_max_i32_e32 v20, v20, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v22, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v21, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v16 -; GFX10-NEXT: v_max_i32_e32 v16, -1, v11 -; GFX10-NEXT: v_min_i32_e32 v38, -1, v10 -; GFX10-NEXT: v_sub_nc_u32_e32 v23, v39, v34 -; GFX10-NEXT: v_min_i32_e32 v17, v18, v19 -; GFX10-NEXT: v_min_i32_e32 v20, v20, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v16, v16, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v38, v35 -; GFX10-NEXT: v_max_i32_e32 v19, v23, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, v8, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, v9, v20 -; GFX10-NEXT: v_max_i32_e32 v20, -1, v13 -; GFX10-NEXT: v_max_i32_e32 v16, v16, v27 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v18 -; GFX10-NEXT: v_max_i32_e32 v19, -1, v12 -; GFX10-NEXT: v_max_i32_e32 v27, -1, v14 -; GFX10-NEXT: v_max_i32_e32 v23, -1, v15 -; GFX10-NEXT: v_min_i32_e32 v18, -1, v11 -; GFX10-NEXT: v_min_i32_e32 v21, -1, v13 -; GFX10-NEXT: v_sub_nc_u32_e32 v19, v19, v34 -; 
GFX10-NEXT: v_sub_nc_u32_e32 v20, v20, v34 -; GFX10-NEXT: v_min_i32_e32 v24, -1, v14 -; GFX10-NEXT: v_min_i32_e32 v25, -1, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v26, v23, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v10, v17 -; GFX10-NEXT: v_min_i32_e32 v17, -1, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v27, v27, v34 -; GFX10-NEXT: v_sub_nc_u32_e32 v18, v18, v35 -; GFX10-NEXT: v_max_i32_e32 v19, v19, v28 -; GFX10-NEXT: v_sub_nc_u32_e32 v21, v21, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v17, v17, v35 -; GFX10-NEXT: v_max_i32_e32 v20, v20, v29 -; GFX10-NEXT: v_sub_nc_u32_e32 v24, v24, v35 -; GFX10-NEXT: v_max_i32_e32 v22, v27, v30 -; GFX10-NEXT: v_sub_nc_u32_e32 v25, v25, v35 -; GFX10-NEXT: v_max_i32_e32 v23, v26, v31 -; GFX10-NEXT: v_min_i32_e32 v16, v16, v18 -; GFX10-NEXT: v_min_i32_e32 v17, v19, v17 -; GFX10-NEXT: v_min_i32_e32 v18, v20, v21 -; GFX10-NEXT: v_min_i32_e32 v19, v22, v24 -; GFX10-NEXT: v_min_i32_e32 v20, v23, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, v15, v20 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_i32 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_i32 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_i32 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_i32 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) @@ -3485,7 +2360,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-NEXT: s_cselect_b32 s34, s0, -1 ; GFX6-NEXT: s_sub_i32 s34, s34, s32 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s33, 0x80000000 +; GFX6-NEXT: s_brev_b32 s33, 1 ; GFX6-NEXT: s_cselect_b32 s35, s0, -1 ; GFX6-NEXT: s_sub_i32 s35, s35, s33 ; GFX6-NEXT: s_cmp_gt_i32 s34, s16 @@ -3667,7 +2542,7 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-NEXT: s_cselect_b32 s34, s0, -1 ; GFX8-NEXT: s_sub_i32 s34, s34, s32 ; GFX8-NEXT: s_cmp_lt_i32 s0, -1 -; GFX8-NEXT: s_mov_b32 s33, 0x80000000 +; GFX8-NEXT: s_brev_b32 s33, 1 ; GFX8-NEXT: s_cselect_b32 s35, s0, -1 ; GFX8-NEXT: s_sub_i32 s35, s35, s33 ; GFX8-NEXT: s_cmp_gt_i32 s34, s16 @@ -3844,367 +2719,91 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX9-LABEL: s_ssubsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_gt_i32 s0, -1 -; GFX9-NEXT: s_brev_b32 s32, -2 -; GFX9-NEXT: s_cselect_b32 s34, s0, -1 -; GFX9-NEXT: s_sub_i32 s34, s34, s32 -; GFX9-NEXT: s_cmp_lt_i32 s0, -1 -; GFX9-NEXT: s_mov_b32 s33, 0x80000000 -; GFX9-NEXT: s_cselect_b32 s35, s0, -1 -; GFX9-NEXT: s_sub_i32 s35, s35, s33 -; GFX9-NEXT: s_cmp_gt_i32 s34, s16 -; GFX9-NEXT: s_cselect_b32 s16, s34, s16 -; GFX9-NEXT: s_cmp_lt_i32 s16, s35 -; GFX9-NEXT: s_cselect_b32 s16, s16, s35 -; GFX9-NEXT: s_sub_i32 s0, s0, s16 -; GFX9-NEXT: s_cmp_gt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 
s16, s1, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s1, -1 -; GFX9-NEXT: s_cselect_b32 s34, s1, -1 -; GFX9-NEXT: s_sub_i32 s34, s34, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_cmp_lt_i32 s16, s34 -; GFX9-NEXT: s_cselect_b32 s16, s16, s34 -; GFX9-NEXT: s_sub_i32 s1, s1, s16 -; GFX9-NEXT: s_cmp_gt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s16, s2, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s2, -1 -; GFX9-NEXT: s_cselect_b32 s17, s2, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s18 -; GFX9-NEXT: s_cselect_b32 s16, s16, s18 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s2, s2, s16 -; GFX9-NEXT: s_cmp_gt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s16, s3, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s3, -1 -; GFX9-NEXT: s_cselect_b32 s17, s3, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s19 -; GFX9-NEXT: s_cselect_b32 s16, s16, s19 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s3, s3, s16 -; GFX9-NEXT: s_cmp_gt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s16, s4, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s4, -1 -; GFX9-NEXT: s_cselect_b32 s17, s4, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s20 -; GFX9-NEXT: s_cselect_b32 s16, s16, s20 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s4, s4, s16 -; GFX9-NEXT: s_cmp_gt_i32 s5, -1 -; GFX9-NEXT: s_cselect_b32 s16, s5, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s5, -1 -; GFX9-NEXT: s_cselect_b32 s17, s5, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s21 -; GFX9-NEXT: s_cselect_b32 s16, s16, s21 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s5, s5, s16 -; GFX9-NEXT: s_cmp_gt_i32 s6, -1 -; GFX9-NEXT: s_cselect_b32 s16, s6, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s6, -1 -; GFX9-NEXT: s_cselect_b32 s17, s6, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s22 -; GFX9-NEXT: s_cselect_b32 s16, s16, s22 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s6, s6, s16 -; GFX9-NEXT: s_cmp_gt_i32 s7, -1 -; GFX9-NEXT: s_cselect_b32 s16, s7, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s7, -1 -; GFX9-NEXT: s_cselect_b32 s17, s7, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s23 -; GFX9-NEXT: s_cselect_b32 s16, s16, s23 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s7, s7, s16 -; GFX9-NEXT: s_cmp_gt_i32 s8, -1 -; GFX9-NEXT: s_cselect_b32 s16, s8, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s8, -1 -; GFX9-NEXT: s_cselect_b32 s17, s8, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s24 -; GFX9-NEXT: s_cselect_b32 s16, s16, s24 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s8, s8, s16 -; GFX9-NEXT: s_cmp_gt_i32 s9, -1 -; GFX9-NEXT: s_cselect_b32 s16, s9, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s9, -1 -; GFX9-NEXT: s_cselect_b32 s17, s9, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s25 -; GFX9-NEXT: s_cselect_b32 
s16, s16, s25 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s9, s9, s16 -; GFX9-NEXT: s_cmp_gt_i32 s10, -1 -; GFX9-NEXT: s_cselect_b32 s16, s10, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s10, -1 -; GFX9-NEXT: s_cselect_b32 s17, s10, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s26 -; GFX9-NEXT: s_cselect_b32 s16, s16, s26 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s10, s10, s16 -; GFX9-NEXT: s_cmp_gt_i32 s11, -1 -; GFX9-NEXT: s_cselect_b32 s16, s11, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s11, -1 -; GFX9-NEXT: s_cselect_b32 s17, s11, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s27 -; GFX9-NEXT: s_cselect_b32 s16, s16, s27 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s11, s11, s16 -; GFX9-NEXT: s_cmp_gt_i32 s12, -1 -; GFX9-NEXT: s_cselect_b32 s16, s12, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s12, -1 -; GFX9-NEXT: s_cselect_b32 s17, s12, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s28 -; GFX9-NEXT: s_cselect_b32 s16, s16, s28 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_gt_i32 s13, -1 -; GFX9-NEXT: s_cselect_b32 s16, s13, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s13, -1 -; GFX9-NEXT: s_cselect_b32 s17, s13, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s29 -; GFX9-NEXT: s_cselect_b32 s16, s16, s29 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s13, s13, s16 -; GFX9-NEXT: s_cmp_gt_i32 s14, -1 -; GFX9-NEXT: s_cselect_b32 s16, s14, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s14, -1 -; GFX9-NEXT: s_cselect_b32 s17, s14, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s30 -; GFX9-NEXT: s_cselect_b32 s16, s16, s30 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_gt_i32 s15, -1 -; GFX9-NEXT: s_cselect_b32 s16, s15, -1 -; GFX9-NEXT: s_sub_i32 s16, s16, s32 -; GFX9-NEXT: s_cmp_lt_i32 s15, -1 -; GFX9-NEXT: s_cselect_b32 s17, s15, -1 -; GFX9-NEXT: s_sub_i32 s17, s17, s33 -; GFX9-NEXT: s_cmp_gt_i32 s16, s31 -; GFX9-NEXT: s_cselect_b32 s16, s16, s31 -; GFX9-NEXT: s_cmp_lt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp +; GFX9-NEXT: v_sub_i32 v5, s5, v5 clamp +; GFX9-NEXT: v_sub_i32 v6, s6, v6 clamp +; GFX9-NEXT: v_sub_i32 
v7, s7, v7 clamp +; GFX9-NEXT: v_sub_i32 v8, s8, v8 clamp +; GFX9-NEXT: v_sub_i32 v9, s9, v9 clamp +; GFX9-NEXT: v_sub_i32 v10, s10, v10 clamp +; GFX9-NEXT: v_sub_i32 v11, s11, v11 clamp +; GFX9-NEXT: v_sub_i32 v12, s12, v12 clamp +; GFX9-NEXT: v_sub_i32 v13, s13, v13 clamp +; GFX9-NEXT: v_sub_i32 v14, s14, v14 clamp +; GFX9-NEXT: v_sub_i32 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_gt_i32 s0, -1 -; GFX10-NEXT: s_brev_b32 s46, -2 -; GFX10-NEXT: s_cselect_b32 s33, s0, -1 -; GFX10-NEXT: s_mov_b32 s34, 0x80000000 -; GFX10-NEXT: s_sub_i32 s47, s33, s46 -; GFX10-NEXT: s_cmp_lt_i32 s0, -1 +; GFX10-NEXT: v_sub_nc_i32 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_i32 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_i32 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_i32 v3, s3, s19 clamp +; GFX10-NEXT: v_sub_nc_i32 v4, s4, s20 clamp +; GFX10-NEXT: v_sub_nc_i32 v5, s5, s21 clamp +; GFX10-NEXT: v_sub_nc_i32 v6, s6, s22 clamp +; GFX10-NEXT: v_sub_nc_i32 v7, s7, s23 clamp +; GFX10-NEXT: v_sub_nc_i32 v8, s8, s24 clamp +; GFX10-NEXT: v_sub_nc_i32 v9, s9, s25 clamp +; GFX10-NEXT: v_sub_nc_i32 v10, s10, s26 clamp +; GFX10-NEXT: v_sub_nc_i32 v11, s11, s27 clamp +; GFX10-NEXT: v_sub_nc_i32 v12, s12, s28 clamp +; GFX10-NEXT: v_sub_nc_i32 v13, s13, s29 clamp +; GFX10-NEXT: v_sub_nc_i32 v14, s14, s30 clamp +; GFX10-NEXT: v_sub_nc_i32 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s35, s0, -1 -; GFX10-NEXT: s_sub_i32 s35, s35, s34 -; GFX10-NEXT: s_cmp_gt_i32 s47, s16 -; GFX10-NEXT: s_cselect_b32 s16, s47, s16 -; GFX10-NEXT: s_cmp_lt_i32 s16, s35 -; GFX10-NEXT: s_cselect_b32 s47, s16, s35 -; GFX10-NEXT: s_sub_i32 s0, s0, s47 -; GFX10-NEXT: s_cmp_gt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s16, s1, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s1, -1 -; GFX10-NEXT: s_cselect_b32 s33, s1, -1 -; GFX10-NEXT: s_sub_i32 s47, s33, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_cmp_lt_i32 s16, s47 -; GFX10-NEXT: s_cselect_b32 s47, s16, s47 -; GFX10-NEXT: s_sub_i32 s1, s1, s47 -; GFX10-NEXT: s_cmp_gt_i32 
s2, -1 -; GFX10-NEXT: s_cselect_b32 s16, s2, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s2, -1 -; GFX10-NEXT: s_cselect_b32 s17, s2, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s18 -; GFX10-NEXT: s_cselect_b32 s16, s16, s18 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_gt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s16, s3, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s3, -1 -; GFX10-NEXT: s_cselect_b32 s17, s3, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s19 -; GFX10-NEXT: s_cselect_b32 s16, s16, s19 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_gt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s16, s4, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s4, -1 -; GFX10-NEXT: s_cselect_b32 s17, s4, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s20 -; GFX10-NEXT: s_cselect_b32 s16, s16, s20 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_gt_i32 s5, -1 -; GFX10-NEXT: s_cselect_b32 s16, s5, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s5, -1 -; GFX10-NEXT: s_cselect_b32 s17, s5, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s21 -; GFX10-NEXT: s_cselect_b32 s16, s16, s21 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_gt_i32 s6, -1 -; GFX10-NEXT: s_cselect_b32 s16, s6, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s6, -1 -; GFX10-NEXT: s_cselect_b32 s17, s6, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s22 -; GFX10-NEXT: s_cselect_b32 s16, s16, s22 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_gt_i32 s7, -1 -; GFX10-NEXT: s_cselect_b32 s16, s7, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s7, -1 -; GFX10-NEXT: s_cselect_b32 s17, s7, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s23 -; GFX10-NEXT: s_cselect_b32 s16, s16, s23 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_gt_i32 s8, -1 -; GFX10-NEXT: s_cselect_b32 s16, s8, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s8, -1 -; GFX10-NEXT: s_cselect_b32 s17, s8, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s24 -; GFX10-NEXT: s_cselect_b32 s16, s16, s24 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_gt_i32 s9, -1 -; GFX10-NEXT: s_cselect_b32 s16, s9, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s9, -1 -; GFX10-NEXT: s_cselect_b32 s17, s9, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s25 -; GFX10-NEXT: s_cselect_b32 s16, s16, s25 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_gt_i32 s10, -1 -; GFX10-NEXT: s_cselect_b32 s16, s10, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s10, -1 -; 
GFX10-NEXT: s_cselect_b32 s17, s10, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s26 -; GFX10-NEXT: s_cselect_b32 s16, s16, s26 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_gt_i32 s11, -1 -; GFX10-NEXT: s_cselect_b32 s16, s11, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s11, -1 -; GFX10-NEXT: s_cselect_b32 s17, s11, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s27 -; GFX10-NEXT: s_cselect_b32 s16, s16, s27 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_gt_i32 s12, -1 -; GFX10-NEXT: s_cselect_b32 s16, s12, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s12, -1 -; GFX10-NEXT: s_cselect_b32 s17, s12, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s28 -; GFX10-NEXT: s_cselect_b32 s16, s16, s28 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_gt_i32 s13, -1 -; GFX10-NEXT: s_cselect_b32 s16, s13, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s13, -1 -; GFX10-NEXT: s_cselect_b32 s17, s13, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s29 -; GFX10-NEXT: s_cselect_b32 s16, s16, s29 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_gt_i32 s14, -1 -; GFX10-NEXT: s_cselect_b32 s16, s14, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s14, -1 -; GFX10-NEXT: s_cselect_b32 s17, s14, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s30 -; GFX10-NEXT: s_cselect_b32 s16, s16, s30 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_gt_i32 s15, -1 -; GFX10-NEXT: s_cselect_b32 s16, s15, -1 -; GFX10-NEXT: s_sub_i32 s16, s16, s46 -; GFX10-NEXT: s_cmp_lt_i32 s15, -1 -; GFX10-NEXT: s_cselect_b32 s17, s15, -1 -; GFX10-NEXT: s_sub_i32 s17, s17, s34 -; GFX10-NEXT: s_cmp_gt_i32 s16, s31 -; GFX10-NEXT: s_cselect_b32 s16, s16, s31 -; GFX10-NEXT: s_cmp_lt_i32 s16, s17 -; GFX10-NEXT: s_cselect_b32 s16, s16, s17 -; GFX10-NEXT: s_sub_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -4229,9 +2828,8 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_ssubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 @@ -4242,29 +2840,15 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-LABEL: v_ssubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_max_i16_e32 v2, s4, v0 -; GFX9-NEXT: v_min_i16_e32 v3, s4, v0 -; GFX9-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 -; GFX9-NEXT: v_max_i16_e32 v1, v2, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v3 -; GFX9-NEXT: 
v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v2, v0, s4 -; GFX10-NEXT: v_min_i16_e64 v3, v0, s4 -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v3, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v3 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4292,13 +2876,13 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_cmp_gt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s4, s2, s3 ; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s2, s3 ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x8000 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_cmp_gt_i32 s3, s1 @@ -4312,45 +2896,16 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: s_ssubsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, 0xffff -; GFX9-NEXT: s_cmp_gt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s2, s3 -; GFX9-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX9-NEXT: s_cmp_lt_i32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, 0x8000 -; GFX9-NEXT: s_sext_i32_i16 s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, 0xffff -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_gt_i32 s3, s2 +; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s3, s2 -; GFX10-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX10-NEXT: s_cmp_lt_i32 s3, s2 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s4 -; GFX10-NEXT: s_sub_i32 s2, s2, 0x8000 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s2, s2 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -4376,13 +2931,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-LABEL: ssubsat_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s1, s0 -; GFX8-NEXT: s_sext_i32_i16 s2, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s2, -1 ; GFX8-NEXT: s_cmp_gt_i32 s1, s2 ; GFX8-NEXT: 
s_cselect_b32 s3, s1, s2 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff ; GFX8-NEXT: s_cmp_lt_i32 s1, s2 ; GFX8-NEXT: s_cselect_b32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, 0x8000 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 @@ -4390,33 +2945,13 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX9-LABEL: ssubsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s1, s0 -; GFX9-NEXT: s_sext_i32_i16 s2, 0xffff -; GFX9-NEXT: s_cmp_gt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s3, s1, s2 -; GFX9-NEXT: s_sub_i32 s3, s3, 0x7fff -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, 0x8000 -; GFX9-NEXT: v_max_i16_e32 v0, s3, v0 -; GFX9-NEXT: v_min_i16_e32 v0, s1, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s1, s0 -; GFX10-NEXT: s_sext_i32_i16 s2, 0xffff +; GFX10-NEXT: v_sub_nc_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_gt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s3, s1, s2 -; GFX10-NEXT: s_sub_i32 s3, s3, 0x7fff -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: v_max_i16_e64 v0, s3, v0 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s1, s1, 0x8000 -; GFX10-NEXT: v_min_i16_e64 v0, v0, s1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4440,10 +2975,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: ssubsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: v_max_i16_e32 v1, s1, v0 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 -; GFX8-NEXT: v_min_i16_e32 v2, s1, v0 +; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 @@ -4452,27 +2986,13 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX9-LABEL: ssubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_max_i16_e32 v1, s1, v0 -; GFX9-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_min_i16_e32 v2, s1, v0 -; GFX9-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_max_i16_e32 v1, s0, v1 -; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: v_sub_nc_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_max_i16_e64 v1, v0, s1 -; GFX10-NEXT: v_min_i16_e64 v2, v0, s1 -; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 0x7fff -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 0x8000 -; GFX10-NEXT: v_max_i16_e64 v1, v1, s0 -; GFX10-NEXT: v_min_i16_e64 v1, v1, v2 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -4488,7 +3008,7 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: 
s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 @@ -4510,18 +3030,17 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v3, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v4, s6, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_max_i16_e32 v4, s6, v2 -; GFX8-NEXT: v_min_i16_e32 v5, s6, v2 +; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4534,37 +3053,15 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s5 -; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX9-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v2, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_min_i16 v3, v0, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 -; GFX10-NEXT: v_pk_sub_i16 v3, v3, s4 -; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -4580,7 +3077,7 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s2 @@ -4615,13 +3112,13 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s7, -1 ; GFX8-NEXT: s_cmp_gt_i32 s6, s7 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: 
s_cselect_b32 s8, s6, s7 ; GFX8-NEXT: s_sub_i32 s8, s8, s4 ; GFX8-NEXT: s_cmp_lt_i32 s6, s7 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: s_cselect_b32 s6, s6, s7 ; GFX8-NEXT: s_sub_i32 s6, s6, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4657,115 +3154,16 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_ssubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 -; GFX9-NEXT: s_ashr_i32 s6, s0, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s8, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s9, s8, 16 -; GFX9-NEXT: s_sub_i32 s2, s8, s2 -; GFX9-NEXT: s_sub_i32 s8, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_mov_b32 s3, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_lt_i32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 -; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s5 -; GFX10-NEXT: s_movk_i32 s8, 0x7fff -; GFX10-NEXT: s_cselect_b32 s6, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s8, s8 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_lshr_b32 s9, s7, 16 -; GFX10-NEXT: s_sub_i32 s6, s6, s7 -; GFX10-NEXT: s_sub_i32 s7, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 
s2, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_sub_i32 s4, s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_sext_i32_i16 s6, s1 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s5, s1 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_sub_i32 s1, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to i32 @@ -4782,7 +3180,7 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_sub_i32 s4, s4, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: s_cselect_b32 s5, s0, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s3 ; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 @@ -4812,13 +3210,13 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s5, -1 ; GFX8-NEXT: s_cmp_gt_i32 s4, s5 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: s_cselect_b32 s6, s4, s5 ; GFX8-NEXT: s_sub_i32 s6, s6, s2 ; GFX8-NEXT: s_cmp_lt_i32 s4, s5 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: s_cselect_b32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s4, s4, s3 ; GFX8-NEXT: v_max_i16_e32 v1, s6, v0 @@ -4841,75 +3239,13 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: ssubsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s3, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_ashr_i32 s5, s0, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s7, s4, s6 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_cselect_b32 s8, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s9, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_sub_i32 s1, s7, s1 -; GFX9-NEXT: s_sub_i32 s7, s8, s9 -; GFX9-NEXT: s_cmp_lt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; 
GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s2 -; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 +; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s4 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s5, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s7 -; GFX10-NEXT: s_lshr_b32 s7, s5, 16 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_sub_i32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s6, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s6 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: v_pk_max_i16 v0, s2, v0 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s2, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 -; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -4924,7 +3260,7 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 @@ -4950,18 +3286,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v2, s4, v0 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 -; GFX8-NEXT: s_mov_b32 s3, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v3, s4, v0 +; GFX8-NEXT: s_movk_i32 s3, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 -; GFX8-NEXT: v_max_i16_e32 v3, s4, v1 -; GFX8-NEXT: v_min_i16_e32 v4, s4, v1 +; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3 ; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4 @@ -4974,35 +3309,13 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: ssubsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s3, -1, -1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, 
s2, s2 -; GFX9-NEXT: v_pk_min_i16 v2, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s2 -; GFX9-NEXT: v_pk_max_i16 v1, v1, s0 -; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX10-NEXT: s_movk_i32 s2, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v1, v0, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX10-NEXT: v_pk_min_i16 v2, v0, s1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff8000 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, s1 -; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 -; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -5029,7 +3342,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 @@ -5056,7 +3369,7 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 @@ -5082,30 +3395,29 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v6, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v7, s6, v0 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 -; GFX8-NEXT: v_max_i16_e32 v7, s6, v4 -; GFX8-NEXT: v_min_i16_e32 v8, s6, v4 +; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v7, s6, v1 +; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 -; GFX8-NEXT: v_min_i16_e32 v8, s6, v1 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 -; GFX8-NEXT: v_max_i16_e32 v8, s6, v5 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 ; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 ; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 @@ -5122,51 +3434,17 @@ 
define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v5, v0, s6 -; GFX9-NEXT: v_pk_max_i16 v2, v4, v2 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 -; GFX9-NEXT: v_pk_min_i16 v2, v2, v5 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s5 -; GFX9-NEXT: v_pk_max_i16 v2, v2, v3 -; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, -1, -1 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v4, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v5, v1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_min_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v7, v1, s5 -; GFX10-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX10-NEXT: v_pk_sub_i16 v5, v5, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; GFX10-NEXT: v_pk_sub_i16 v6, v6, s6 -; GFX10-NEXT: v_pk_sub_i16 v4, v7, s6 -; GFX10-NEXT: v_pk_max_i16 v3, v5, v3 -; GFX10-NEXT: v_pk_min_i16 v2, v11, v6 -; GFX10-NEXT: v_pk_min_i16 v3, v3, v4 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> @@ -5183,7 +3461,7 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s4 @@ -5252,13 +3530,13 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s11, -1 ; GFX8-NEXT: s_cmp_gt_i32 s10, s11 ; GFX8-NEXT: s_movk_i32 s8, 0x7fff ; GFX8-NEXT: s_cselect_b32 s12, s10, s11 ; GFX8-NEXT: s_sub_i32 s12, s12, s8 ; GFX8-NEXT: s_cmp_lt_i32 s10, s11 -; GFX8-NEXT: s_mov_b32 s9, 0x8000 +; GFX8-NEXT: s_movk_i32 s9, 0x8000 ; GFX8-NEXT: s_cselect_b32 s10, s10, s11 ; GFX8-NEXT: s_sub_i32 s10, s10, s9 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 @@ -5330,201 +3608,21 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s9, s6 -; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: 
s_ashr_i32 s8, s0, 16 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s10, s7, s9 -; GFX9-NEXT: s_cmp_gt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_cselect_b32 s11, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: s_lshr_b32 s11, s10, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_sub_i32 s10, s10, s4 -; GFX9-NEXT: s_sub_i32 s11, s11, s12 -; GFX9-NEXT: s_cmp_lt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s8, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_lshr_b32 s11, s5, 16 -; GFX9-NEXT: s_sub_i32 s7, s7, s5 -; GFX9-NEXT: s_sub_i32 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_sext_i32_i16 s8, s10 -; GFX9-NEXT: s_sext_i32_i16 s13, s2 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s8, s13 -; GFX9-NEXT: s_cselect_b32 s8, s8, s13 -; GFX9-NEXT: s_cmp_gt_i32 s10, s2 -; GFX9-NEXT: s_cselect_b32 s2, s10, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX9-NEXT: s_sext_i32_i16 s8, s2 -; GFX9-NEXT: s_sext_i32_i16 s10, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 -; GFX9-NEXT: s_cmp_lt_i32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s8, s8, s10 -; GFX9-NEXT: s_cmp_lt_i32 s2, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s2, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s7, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s8, s2, s9 -; GFX9-NEXT: s_cmp_gt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s10, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s10 -; GFX9-NEXT: s_lshr_b32 s10, s8, 16 -; GFX9-NEXT: s_sub_i32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s2, s2, s9 -; GFX9-NEXT: s_cmp_lt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s6, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_gt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_i32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_sub_i32 s2, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_i16 v0, 
s0, v0 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s5, s0 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 -; GFX10-NEXT: s_ashr_i32 s6, s0, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_movk_i32 s10, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_mov_b32 s12, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s9, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s10, s10 -; GFX10-NEXT: s_lshr_b32 s10, s8, 16 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 -; GFX10-NEXT: s_sub_i32 s8, s8, s9 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_sext_i32_i16 s14, s2 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s4 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s8, s10 -; GFX10-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-NEXT: s_lshr_b32 s10, s12, 16 -; GFX10-NEXT: s_sext_i32_i16 s13, s6 -; GFX10-NEXT: s_sub_i32 s5, s5, s12 -; GFX10-NEXT: s_sub_i32 s8, s8, s10 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s13, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX10-NEXT: s_cselect_b32 s13, s13, s14 -; GFX10-NEXT: s_cmp_gt_i32 s6, s2 -; GFX10-NEXT: s_sext_i32_i16 s8, s5 -; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s13, s2 -; GFX10-NEXT: s_sext_i32_i16 s6, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s2 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_sub_i32 s2, s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s1 -; GFX10-NEXT: s_ashr_i32 s6, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s13, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s13 -; GFX10-NEXT: s_lshr_b32 s13, s8, 16 -; GFX10-NEXT: s_sub_i32 s8, s8, s9 -; GFX10-NEXT: s_sub_i32 s9, s13, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s8, s9 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_sext_i32_i16 s7, s5 -; GFX10-NEXT: s_sext_i32_i16 s8, s3 -; GFX10-NEXT: s_sub_i32 s4, s4, s12 -; GFX10-NEXT: s_sub_i32 s6, s6, s10 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX10-NEXT: s_cselect_b32 s7, s7, s8 -; GFX10-NEXT: s_cmp_gt_i32 s5, s3 -; GFX10-NEXT: s_sext_i32_i16 s6, s4 -; GFX10-NEXT: s_cselect_b32 s3, s5, s3 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: 
s_pack_ll_b32_b16 s3, s7, s3 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s3, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -5551,7 +3649,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 @@ -5578,7 +3676,7 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 @@ -5628,58 +3726,56 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v10, s6, v0 -; GFX8-NEXT: v_subrev_u16_e32 v10, s4, v10 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v12, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v12, s5, v12 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 -; GFX8-NEXT: v_min_i16_e32 v10, v10, v12 -; GFX8-NEXT: v_max_i16_e32 v12, s6, v6 -; GFX8-NEXT: v_min_i16_e32 v14, s6, v6 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_max_i16_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v12, s6, v1 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v14 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_min_i16_e32 v14, s6, v1 +; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 +; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 +; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 +; GFX8-NEXT: 
v_min_i16_e32 v14, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_e32 v14, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX8-NEXT: v_min_i16_e32 v15, v7, v9 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v15, s5, v15 -; GFX8-NEXT: v_mov_b32_e32 v11, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v14, v2, v9 -; GFX8-NEXT: v_sub_u16_e32 v14, v14, v11 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v15 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v15, v2, v9 +; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v15, v15, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 -; GFX8-NEXT: v_max_i16_e32 v15, v8, v9 -; GFX8-NEXT: v_min_i16_e32 v9, v8, v9 -; GFX8-NEXT: v_sub_u16_e32 v11, v15, v11 -; GFX8-NEXT: v_sub_u16_e32 v9, v9, v13 -; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v10 +; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 +; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_sub_u16_e32 v12, v14, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v9 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v12 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v13 ; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5687,65 +3783,19 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v6, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v6, v6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v7, v0, s6 -; GFX9-NEXT: v_pk_max_i16 v3, v6, v3 -; GFX9-NEXT: v_pk_sub_i16 v7, v7, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v3, v7 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s4 -; GFX9-NEXT: v_pk_min_i16 v6, v1, s6 -; GFX9-NEXT: v_pk_max_i16 v3, v3, v4 -; GFX9-NEXT: 
v_pk_sub_i16 v6, v6, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v3, v6 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s4 -; GFX9-NEXT: v_pk_min_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s5 -; GFX9-NEXT: v_pk_max_i16 v3, v3, v5 -; GFX9-NEXT: v_pk_min_i16 v3, v3, v4 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v6i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, -1, -1 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v8, v1, s5 -; GFX10-NEXT: v_pk_max_i16 v9, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 -; GFX10-NEXT: v_pk_sub_i16 v6, v6, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s4 -; GFX10-NEXT: v_pk_sub_i16 v19, v9, s4 -; GFX10-NEXT: v_pk_min_i16 v10, v1, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v2, s5 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 -; GFX10-NEXT: v_pk_max_i16 v14, v6, v3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v7, v7, s6 -; GFX10-NEXT: v_pk_sub_i16 v6, v10, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v11, s6 -; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4 clamp +; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_i16 v3, v14, v7 -; GFX10-NEXT: v_pk_min_i16 v4, v4, v6 -; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x float> @@ -5762,7 +3812,7 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_cselect_b32 s14, s0, -1 ; GFX6-NEXT: s_sub_i32 s14, s14, s12 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s13, 0x80000000 +; GFX6-NEXT: s_brev_b32 s13, 1 ; GFX6-NEXT: s_cselect_b32 s15, s0, -1 ; GFX6-NEXT: s_sub_i32 s15, s15, s13 ; GFX6-NEXT: s_cmp_gt_i32 s14, s6 @@ -5865,13 +3915,13 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s15, -1 ; GFX8-NEXT: s_cmp_gt_i32 s14, s15 ; GFX8-NEXT: s_movk_i32 s12, 0x7fff ; GFX8-NEXT: s_cselect_b32 s16, s14, s15 ; GFX8-NEXT: s_sub_i32 s16, s16, s12 ; GFX8-NEXT: s_cmp_lt_i32 s14, s15 -; GFX8-NEXT: s_mov_b32 s13, 0x8000 +; GFX8-NEXT: s_movk_i32 s13, 0x8000 ; GFX8-NEXT: s_cselect_b32 s14, s14, s15 ; GFX8-NEXT: s_sub_i32 s14, s14, s13 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 @@ -5979,287 +4029,26 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s11, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 -; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s12, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 
s10, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX9-NEXT: s_lshr_b32 s13, s12, 16 -; GFX9-NEXT: s_lshr_b32 s14, s6, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s6 -; GFX9-NEXT: s_sub_i32 s13, s13, s14 -; GFX9-NEXT: s_cmp_lt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s9, s9, s11 -; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_mov_b32 s7, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s10, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s13, s7, 16 -; GFX9-NEXT: s_sub_i32 s9, s9, s7 -; GFX9-NEXT: s_sub_i32 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s10, s12 -; GFX9-NEXT: s_sext_i32_i16 s15, s3 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s10, s15 -; GFX9-NEXT: s_cselect_b32 s10, s10, s15 -; GFX9-NEXT: s_cmp_gt_i32 s12, s3 -; GFX9-NEXT: s_cselect_b32 s3, s12, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s12, s9 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 -; GFX9-NEXT: s_cmp_lt_i32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s10, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s3, s9 -; GFX9-NEXT: s_cselect_b32 s3, s3, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s9, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s10, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s12, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_sub_i32 s10, s10, s6 -; GFX9-NEXT: s_sub_i32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s9, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s9, s9, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_sext_i32_i16 s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s12 -; GFX9-NEXT: s_cselect_b32 s9, s9, s12 -; GFX9-NEXT: s_cmp_gt_i32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_lt_i32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s9, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; GFX9-NEXT: s_ashr_i32 s4, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s9, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 
s10, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_sub_i32 s6, s9, s6 -; GFX9-NEXT: s_sub_i32 s9, s10, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s6 -; GFX9-NEXT: s_sext_i32_i16 s7, s5 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s5 -; GFX9-NEXT: s_cselect_b32 s5, s6, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_lt_i32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s7, s0 -; GFX10-NEXT: s_sext_i32_i16 s9, s6 -; GFX10-NEXT: s_ashr_i32 s8, s0, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 -; GFX10-NEXT: s_movk_i32 s12, 0x7fff -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_mov_b32 s14, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s11, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s12, s12 -; GFX10-NEXT: s_lshr_b32 s12, s10, 16 -; GFX10-NEXT: s_lshr_b32 s13, s11, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_sext_i32_i16 s16, s3 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_sub_i16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s12 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_lshr_b32 s12, s14, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 -; GFX10-NEXT: s_cmp_gt_i32 s8, s3 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 -; GFX10-NEXT: s_cselect_b32 s3, s8, s3 -; GFX10-NEXT: 
s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s15, s3 -; GFX10-NEXT: s_sext_i32_i16 s16, s4 -; GFX10-NEXT: s_sext_i32_i16 s8, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_sub_i32 s3, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s1 -; GFX10-NEXT: s_ashr_i32 s8, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s15, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s15 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 -; GFX10-NEXT: s_cmp_gt_i32 s8, s4 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s15, s4 -; GFX10-NEXT: s_sext_i32_i16 s8, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, s7 -; GFX10-NEXT: s_cselect_b32 s4, s4, s7 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s4, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s2 -; GFX10-NEXT: s_ashr_i32 s8, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s11, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s10, s11 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_sext_i32_i16 s9, s7 -; GFX10-NEXT: s_sext_i32_i16 s10, s5 -; GFX10-NEXT: s_sub_i32 s6, s6, s14 -; GFX10-NEXT: s_sub_i32 s8, s8, s12 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_cmp_gt_i32 s7, s5 -; GFX10-NEXT: s_sext_i32_i16 s8, s6 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s9, s5 -; GFX10-NEXT: s_sext_i32_i16 s7, s5 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s7, s8 -; GFX10-NEXT: s_cselect_b32 s7, s7, s8 
-; GFX10-NEXT: s_cmp_lt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s5 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s3 -; GFX10-NEXT: s_sub_i32 s3, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -6275,7 +4064,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 @@ -6302,7 +4091,7 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000 +; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 @@ -6376,74 +4165,72 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v13, s6, v0 -; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 -; GFX8-NEXT: s_mov_b32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v15, s6, v0 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 +; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v15, s5, v15 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v15 -; GFX8-NEXT: v_max_i16_e32 v15, s6, v8 -; GFX8-NEXT: v_min_i16_e32 v17, s6, v8 -; GFX8-NEXT: v_subrev_u16_e32 v15, s4, v15 -; GFX8-NEXT: v_max_i16_sdwa v4, v15, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v15, s6, v1 -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v17 -; GFX8-NEXT: v_subrev_u16_e32 v15, s4, v15 -; GFX8-NEXT: v_min_i16_e32 v17, s6, v1 +; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 +; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 +; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 +; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 +; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 +; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v5 -; GFX8-NEXT: v_min_i16_e32 v15, v15, v17 -; GFX8-NEXT: v_max_i16_e32 v17, s6, v9 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff -; GFX8-NEXT: v_min_i16_e32 v18, v9, v12 -; GFX8-NEXT: v_subrev_u16_e32 v17, s4, v17 -; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v18, s5, v18 -; GFX8-NEXT: v_mov_b32_e32 v14, 0x7fff -; GFX8-NEXT: v_max_i16_e32 v17, v2, v12 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v14 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v18, v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff +; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 +; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v16 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 -; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 -; GFX8-NEXT: v_max_i16_e32 v18, v10, v12 -; GFX8-NEXT: v_min_i16_e32 v19, v10, v12 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v14 -; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v18, v3, v12 -; GFX8-NEXT: v_sub_u16_e32 v19, v19, v16 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v14 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v19 -; GFX8-NEXT: v_min_i16_e32 v19, v3, v12 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v15 +; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 +; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 +; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v19, v19, v16 -; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 -; GFX8-NEXT: v_min_i16_e32 v18, v18, v19 -; GFX8-NEXT: v_max_i16_e32 v19, v11, v12 -; GFX8-NEXT: v_min_i16_e32 v12, v11, v12 -; GFX8-NEXT: v_sub_u16_e32 v14, v19, v14 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v13 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 +; GFX8-NEXT: v_max_i16_e32 v17, v17, v7 +; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 +; GFX8-NEXT: v_max_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v12, v12, v16 -; GFX8-NEXT: v_max_i16_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v15 +; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 +; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_i16_e32 v7, v7, v12 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v17 +; GFX8-NEXT: v_min_i16_e32 
v7, v7, v15 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v18 +; GFX8-NEXT: v_sub_u16_e32 v3, v3, v17 ; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6451,79 +4238,21 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v8, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v8, v8, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v9, v0, s6 -; GFX9-NEXT: v_pk_max_i16 v4, v8, v4 -; GFX9-NEXT: v_pk_sub_i16 v9, v9, s5 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v9 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v8, v1, s6 -; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_sub_i16 v8, v8, s5 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v8 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 -; GFX9-NEXT: v_pk_max_i16 v4, v4, v6 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 -; GFX9-NEXT: v_pk_max_i16 v4, v4, v7 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v8, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s4 -; GFX10-NEXT: v_pk_max_i16 v12, v3, s4 -; GFX10-NEXT: v_pk_min_i16 v9, v0, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s5 -; GFX10-NEXT: v_pk_max_i16 v8, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v10, v10, s5 -; GFX10-NEXT: v_pk_sub_i16 v12, v12, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v1, s4 -; GFX10-NEXT: v_pk_min_i16 v13, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v8, v8, s5 -; GFX10-NEXT: v_pk_min_i16 v14, v3, s4 -; GFX10-NEXT: s_mov_b32 s6, 0xffff8000 -; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v5, v10, v5 -; GFX10-NEXT: v_pk_sub_i16 v11, v11, s6 -; GFX10-NEXT: v_pk_sub_i16 v9, v9, s6 -; GFX10-NEXT: v_pk_max_i16 v15, v8, v6 -; GFX10-NEXT: v_pk_sub_i16 v10, v13, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v14, s6 -; GFX10-NEXT: v_pk_max_i16 v7, v12, v7 -; GFX10-NEXT: v_pk_min_i16 v19, v4, v9 -; GFX10-NEXT: v_pk_min_i16 v11, v5, v11 -; GFX10-NEXT: v_pk_min_i16 v15, v15, v10 +; GFX10-NEXT: v_pk_sub_i16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v1, v5 clamp +; GFX10-NEXT: v_pk_sub_i16 v2, v2, v6 clamp +; GFX10-NEXT: 
v_pk_sub_i16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_min_i16 v6, v7, v8 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v19 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v11 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v15 -; GFX10-NEXT: v_pk_sub_i16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -6540,7 +4269,7 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_cselect_b32 s18, s0, -1 ; GFX6-NEXT: s_sub_i32 s18, s18, s16 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s17, 0x80000000 +; GFX6-NEXT: s_brev_b32 s17, 1 ; GFX6-NEXT: s_cselect_b32 s19, s0, -1 ; GFX6-NEXT: s_sub_i32 s19, s19, s17 ; GFX6-NEXT: s_cmp_gt_i32 s18, s8 @@ -6677,13 +4406,13 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s19, -1 ; GFX8-NEXT: s_cmp_gt_i32 s18, s19 ; GFX8-NEXT: s_movk_i32 s16, 0x7fff ; GFX8-NEXT: s_cselect_b32 s20, s18, s19 ; GFX8-NEXT: s_sub_i32 s20, s20, s16 ; GFX8-NEXT: s_cmp_lt_i32 s18, s19 -; GFX8-NEXT: s_mov_b32 s17, 0x8000 +; GFX8-NEXT: s_movk_i32 s17, 0x8000 ; GFX8-NEXT: s_cselect_b32 s18, s18, s19 ; GFX8-NEXT: s_sub_i32 s18, s18, s17 ; GFX8-NEXT: s_sext_i32_i16 s20, s20 @@ -6827,373 +4556,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s10, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s13, s10 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_ashr_i32 s12, s0, 16 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s14, s11, s13 -; GFX9-NEXT: s_cmp_gt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_cselect_b32 s15, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s8 -; GFX9-NEXT: s_lshr_b32 s15, s14, 16 -; GFX9-NEXT: s_lshr_b32 s16, s8, 16 -; GFX9-NEXT: s_sub_i32 s14, s14, s8 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s11, s11, s13 -; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff8000 -; GFX9-NEXT: s_cselect_b32 s12, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s15, s9, 16 -; GFX9-NEXT: s_sub_i32 s11, s11, s9 -; GFX9-NEXT: s_sub_i32 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_sext_i32_i16 s12, s14 -; GFX9-NEXT: s_sext_i32_i16 s17, s4 -; GFX9-NEXT: s_ashr_i32 s14, s14, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s12, s17 -; GFX9-NEXT: s_cselect_b32 s12, s12, s17 -; GFX9-NEXT: s_cmp_gt_i32 s14, s4 -; GFX9-NEXT: s_cselect_b32 s4, s14, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s14, s11 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_cmp_lt_i32 s12, s14 -; GFX9-NEXT: s_cselect_b32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s4, s11 -; GFX9-NEXT: s_cselect_b32 s4, s4, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_lshr_b32 s11, s0, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 
16 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s11, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_ashr_i32 s11, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s12, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s14, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s8 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s11, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s11, s11, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_sext_i32_i16 s11, s12 -; GFX9-NEXT: s_sext_i32_i16 s14, s5 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s14 -; GFX9-NEXT: s_cselect_b32 s11, s11, s14 -; GFX9-NEXT: s_cmp_gt_i32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_ashr_i32 s5, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s11, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s12, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_sub_i32 s11, s11, s8 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s11 -; GFX9-NEXT: s_sext_i32_i16 s12, s6 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_cmp_gt_i32 s11, s6 -; GFX9-NEXT: s_cselect_b32 s6, s11, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_sext_i32_i16 s6, s5 -; GFX9-NEXT: s_sext_i32_i16 s11, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s6, s11 -; GFX9-NEXT: s_cselect_b32 s6, s6, s11 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_ashr_i32 s5, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s6, s4, 
s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s11, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, s11, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s6 -; GFX9-NEXT: s_sext_i32_i16 s8, s7 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s8 -; GFX9-NEXT: s_cselect_b32 s5, s5, s8 -; GFX9-NEXT: s_cmp_gt_i32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_sext_i32_i16 s6, s5 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s6, s6, s7 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_sub_i16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_ssubsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_sext_i32_i16 s11, s8 -; GFX10-NEXT: s_ashr_i32 s10, s0, 16 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 -; GFX10-NEXT: s_movk_i32 s14, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_mov_b32 s16, 0xffff8000 -; GFX10-NEXT: s_cselect_b32 s13, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s14, s14 -; GFX10-NEXT: s_lshr_b32 s14, s12, 16 -; GFX10-NEXT: s_lshr_b32 s15, s13, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s14, s14, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_sext_i32_i16 s18, s4 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 +; GFX10-NEXT: v_pk_sub_i16 v0, s0, s4 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, s1, s5 clamp +; GFX10-NEXT: v_pk_sub_i16 v2, s2, s6 clamp +; GFX10-NEXT: v_pk_sub_i16 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s14 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_lshr_b32 s14, s16, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 -; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; 
GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 -; GFX10-NEXT: s_cmp_gt_i32 s10, s4 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 -; GFX10-NEXT: s_cselect_b32 s4, s10, s4 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s17, s4 -; GFX10-NEXT: s_sext_i32_i16 s18, s5 -; GFX10-NEXT: s_sext_i32_i16 s10, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s1 -; GFX10-NEXT: s_ashr_i32 s10, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 -; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 -; GFX10-NEXT: s_cmp_gt_i32 s10, s5 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 -; GFX10-NEXT: s_cselect_b32 s5, s10, s5 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX10-NEXT: s_sext_i32_i16 s18, s6 -; GFX10-NEXT: s_sext_i32_i16 s10, s5 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s5, s9 -; GFX10-NEXT: s_cselect_b32 s5, s5, s9 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_sub_i32 s5, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s2 -; GFX10-NEXT: s_ashr_i32 s10, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 -; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 -; GFX10-NEXT: s_cmp_gt_i32 s10, s6 
-; GFX10-NEXT: s_sext_i32_i16 s12, s9 -; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s17, s6 -; GFX10-NEXT: s_sext_i32_i16 s10, s6 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s6, s9 -; GFX10-NEXT: s_cselect_b32 s6, s6, s9 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s10, s6 -; GFX10-NEXT: s_lshr_b32 s10, s6, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s6 -; GFX10-NEXT: s_sub_i32 s6, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s3 -; GFX10-NEXT: s_ashr_i32 s10, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s13, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s8, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s12, s13 -; GFX10-NEXT: s_lshr_b32 s10, s8, 16 -; GFX10-NEXT: s_sext_i32_i16 s11, s9 -; GFX10-NEXT: s_sext_i32_i16 s12, s7 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_sub_i32 s10, s10, s14 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_cmp_gt_i32 s11, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 -; GFX10-NEXT: s_cselect_b32 s11, s11, s12 -; GFX10-NEXT: s_cmp_gt_i32 s9, s7 -; GFX10-NEXT: s_sext_i32_i16 s10, s8 -; GFX10-NEXT: s_cselect_b32 s7, s9, s7 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s11, s7 -; GFX10-NEXT: s_sext_i32_i16 s9, s7 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_i32 s9, s10 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_cmp_lt_i32 s7, s8 -; GFX10-NEXT: s_cselect_b32 s4, s7, s8 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX10-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s4, s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 413bc71038470..cbabb07aa9361 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -21,9 +21,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -32,9 +30,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; 
GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -45,9 +41,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs) @@ -69,31 +63,23 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-LABEL: s_uaddsat_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i7: @@ -102,14 +88,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result @@ -132,9 +113,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -143,9 +122,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -156,9 +133,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) @@ -180,31 +155,23 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-LABEL: s_uaddsat_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i8: @@ -213,14 +180,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result @@ -256,17 +218,12 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v4, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v3 -; GFX8-NEXT: v_min_u16_e32 v1, v1, v2 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -279,15 +236,10 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v4, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v2 -; GFX9-NEXT: v_min_u16_e32 v1, v1, v3 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u16_e64 v1, v2, v3 clamp ; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -298,22 +250,17 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshlrev_b16_e64 v2, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v4, s5, v2 -; GFX10-NEXT: v_xor_b32_e32 v5, s5, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v4, v1 -; GFX10-NEXT: v_min_u16_e64 v3, v5, v3 -; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_nc_u16_e64 v1, v2, v1 clamp +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v3 clamp +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 
; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -351,101 +298,60 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-LABEL: s_uaddsat_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_mov_b32 s5, 0xffff -; GFX8-NEXT: s_xor_b32 s6, s0, s5 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s1 -; GFX8-NEXT: s_cselect_b32 s1, s6, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s2, s3, s4 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_xor_b32 s3, s1, s5 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s2 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s3, s4 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: s_lshl_b32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_xor_b32 s6, s0, s5 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s6, s1 -; GFX9-NEXT: s_cselect_b32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s2, s3, s4 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_xor_b32 s3, s1, s5 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: s_lshl_b32 s0, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 
src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, s0, s2 -; GFX10-NEXT: s_lshl_b32 s6, s1, s2 -; GFX10-NEXT: s_xor_b32 s5, s3, s4 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_lt_u32 s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_add_nc_u16_e64 v1, s2, s3 clamp +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s5 -; GFX10-NEXT: s_xor_b32 s4, s0, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s3, s2 -; GFX10-NEXT: s_cmp_lt_u32 s4, s1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s3, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -508,28 +414,19 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v8, s4, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v8, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v3 -; GFX8-NEXT: v_min_u16_e32 v1, v1, v2 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v2 -; GFX8-NEXT: v_min_u16_e32 v3, v4, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u16_e64 v2, v2, v3 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_xor_b32_e32 
v5, s4, v3 -; GFX8-NEXT: v_min_u16_e32 v4, v5, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX8-NEXT: v_add_u16_e64 v3, v3, v4 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -549,31 +446,22 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v8, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v2 -; GFX9-NEXT: v_min_u16_e32 v1, v1, v5 -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: v_add_u16_e64 v1, v2, v5 clamp ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v2 -; GFX9-NEXT: v_min_u16_e32 v3, v5, v3 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_min_u16_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u16_e64 v2, v2, v3 clamp +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e64 v3, v3, v4 clamp ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -585,37 +473,28 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_mov_b32 s6, 24 -; GFX10-NEXT: v_xor_b32_e32 v5, s7, v2 -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v7, 8, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, s7, v4 -; GFX10-NEXT: v_min_u16_e64 v3, v5, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v11, s7, v6 -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v7, v8, v7 -; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3 -; GFX10-NEXT: v_xor_b32_e32 v3, s7, v0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_min_u16_e64 v5, v11, v5 -; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, v7 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v1, v3, v1 +; GFX10-NEXT: s_mov_b32 s4, 24 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, v3 clamp +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16_e64 v5, v5, v6 clamp +; GFX10-NEXT: v_add_nc_u16_e64 v3, v4, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_nc_u16_e64 v3, v6, v5 -; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v4 +; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v2, v4, s4, v2 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2 ; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -678,191 +557,107 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-LABEL: s_uaddsat_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_mov_b32 s9, 0xffff -; GFX8-NEXT: s_xor_b32 s10, s0, s9 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s10, s1 -; GFX8-NEXT: s_cselect_b32 s1, s10, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_xor_b32 s5, s1, s9 -; GFX8-NEXT: s_lshr_b32 s0, s0, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s2 -; GFX8-NEXT: s_cselect_b32 s2, s5, s2 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_xor_b32 s5, s2, s9 -; GFX8-NEXT: s_lshr_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s3 -; GFX8-NEXT: 
s_cselect_b32 s3, s5, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s3, s4, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_xor_b32 s5, s3, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s4 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_and_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_xor_b32 s10, s0, s9 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s10, s1 -; GFX9-NEXT: s_cselect_b32 s1, s10, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_xor_b32 s5, s1, s9 -; GFX9-NEXT: s_lshr_b32 s0, s0, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_xor_b32 s5, s2, s9 -; GFX9-NEXT: s_lshr_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 
0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_xor_b32 s5, s3, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_add_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_add_u16_e64 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_add_nc_u16_e64 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s5 -; GFX10-NEXT: s_mov_b32 s7, 0xffff -; GFX10-NEXT: s_lshl_b32 s10, s1, s5 -; GFX10-NEXT: s_xor_b32 s9, s0, s7 -; GFX10-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-NEXT: s_cmp_lt_u32 s9, s10 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_lshl_b32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s0, s0, s9 -; GFX10-NEXT: s_xor_b32 s9, s2, s7 -; GFX10-NEXT: s_lshl_b32 s6, s6, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100000 -; 
GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s6, s9, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_add_i32 s2, s2, s6 -; GFX10-NEXT: s_xor_b32 s6, s3, s7 -; GFX10-NEXT: s_lshl_b32 s8, s8, s5 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_add_i32 s3, s3, s6 -; GFX10-NEXT: s_xor_b32 s6, s4, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_u32 s6, s1 -; GFX10-NEXT: s_cselect_b32 s1, s6, s1 -; GFX10-NEXT: s_add_i32 s4, s4, s1 -; GFX10-NEXT: s_bfe_u32 s1, s4, 0x100000 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s4 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_add_nc_u16_e64 v2, s3, s0 clamp +; GFX10-NEXT: v_add_nc_u16_e64 v3, s4, s1 clamp +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -888,9 +683,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -899,9 +692,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -912,9 +703,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX10-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) @@ 
-935,36 +724,32 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX8-LABEL: s_uaddsat_i24: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_not_b32 s2, s0 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_not_b32 s2, s0 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_not_b32 s2, s0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -982,27 +767,21 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-LABEL: v_uaddsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1019,27 +798,23 @@ define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: s_uaddsat_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s2, s0 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s2, s0 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s2, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -1055,24 +830,18 @@ define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX8-LABEL: uaddsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s1, s0 -; GFX8-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s1, s0 -; GFX9-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s1, s0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1089,24 +858,18 @@ define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: uaddsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1128,36 +891,24 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_uaddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp 
+; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u32_e32 v2, v4, v2
-; GFX10-NEXT: v_min_u32_e32 v3, v5, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ret <2 x i32> %result
@@ -1178,39 +929,31 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 ;
 ; GFX8-LABEL: s_uaddsat_v2i32:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s4, s0
-; GFX8-NEXT: s_cmp_lt_u32 s4, s2
-; GFX8-NEXT: s_cselect_b32 s2, s4, s2
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_not_b32 s2, s1
-; GFX8-NEXT: s_cmp_lt_u32 s2, s3
-; GFX8-NEXT: s_cselect_b32 s2, s2, s3
-; GFX8-NEXT: s_add_i32 s1, s1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], s0, v0 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v2i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s4, s0
-; GFX9-NEXT: s_cmp_lt_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_add_i32 s0, s0, s2
-; GFX9-NEXT: s_not_b32 s2, s1
-; GFX9-NEXT: s_cmp_lt_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_add_i32 s1, s1, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v2i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_not_b32 s4, s0
+; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s2 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s4, s2
-; GFX10-NEXT: s_cselect_b32 s2, s4, s2
-; GFX10-NEXT: s_not_b32 s4, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s2
-; GFX10-NEXT: s_cmp_lt_u32 s4, s3
-; GFX10-NEXT: s_cselect_b32 s2, s4, s3
-; GFX10-NEXT: s_add_i32 s1, s1, s2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ret <2 x i32> %result
@@ -1234,45 +977,27 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v3i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v6, -1, v0
-; GFX8-NEXT: v_min_u32_e32 v3, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT: v_xor_b32_e32 v3, -1, v1
-; GFX8-NEXT: v_min_u32_e32 v3, v3, v4
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2
-; GFX8-NEXT: v_min_u32_e32 v3, v3, v5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v5 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v3i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0
-; GFX9-NEXT: v_min_u32_e32 v3, v6, v3
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1
-; GFX9-NEXT: v_min_u32_e32 v3, v3, v4
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
-; GFX9-NEXT: v_min_u32_e32 v3, v3, v5
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v3i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1
-; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u32_e32 v3, v6, v3
-; GFX10-NEXT: v_min_u32_e32 v4, v7, v4
-; GFX10-NEXT: v_min_u32_e32 v5, v8, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
 ret <3 x i32> %result
@@ -1297,51 +1022,39 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 ;
 ; GFX8-LABEL: s_uaddsat_v3i32:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s6, s0
-; GFX8-NEXT: s_cmp_lt_u32 s6, s3
-; GFX8-NEXT: s_cselect_b32 s3, s6, s3
-; GFX8-NEXT: s_add_i32 s0, s0, s3
-; GFX8-NEXT: s_not_b32 s3, s1
-; GFX8-NEXT: s_cmp_lt_u32 s3, s4
-; GFX8-NEXT: s_cselect_b32 s3, s3, s4
-; GFX8-NEXT: s_add_i32 s1, s1, s3
-; GFX8-NEXT: s_not_b32 s3, s2
-; GFX8-NEXT: s_cmp_lt_u32 s3, s5
-; GFX8-NEXT: s_cselect_b32 s3, s3, s5
-; GFX8-NEXT: s_add_i32 s2, s2, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], s0, v0 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v3i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s6, s0
-; GFX9-NEXT: s_cmp_lt_u32 s6, s3
-; GFX9-NEXT: s_cselect_b32 s3, s6, s3
-; GFX9-NEXT: s_add_i32 s0, s0, s3
-; GFX9-NEXT: s_not_b32 s3, s1
-; GFX9-NEXT: s_cmp_lt_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s3, s4
-; GFX9-NEXT: s_add_i32 s1, s1, s3
-; GFX9-NEXT: s_not_b32 s3, s2
-; GFX9-NEXT: s_cmp_lt_u32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s3, s3, s5
-; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v3i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_not_b32 s6, s0
+; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s3 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s4 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s6, s3
-; GFX10-NEXT: s_cselect_b32 s3, s6, s3
-; GFX10-NEXT: s_not_b32 s6, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s3
-; GFX10-NEXT: s_cmp_lt_u32 s6, s4
-; GFX10-NEXT: s_cselect_b32 s3, s6, s4
-; GFX10-NEXT: s_not_b32 s4, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s3
-; GFX10-NEXT: s_cmp_lt_u32 s4, s5
-; GFX10-NEXT: s_cselect_b32 s3, s4, s5
-; GFX10-NEXT: s_add_i32 s2, s2, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
 ret <3 x i32> %result
@@ -1368,54 +1081,30 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v4i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v8, -1, v0
-; GFX8-NEXT: v_min_u32_e32 v4, v8, v4
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v1
-; GFX8-NEXT: v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2
-; GFX8-NEXT: v_min_u32_e32 v4, v4, v6
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
-; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3
-; GFX8-NEXT: v_min_u32_e32 v4, v4, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0
-; GFX9-NEXT: v_min_u32_e32 v4, v8, v4
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
-; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1
-; GFX9-NEXT: v_min_u32_e32 v4, v4, v5
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2
-; GFX9-NEXT: v_min_u32_e32 v4, v4, v6
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3
-; GFX9-NEXT: v_min_u32_e32 v4, v4, v7
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v4i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1
-; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2
-; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u32_e32 v11, v15, v4
-; GFX10-NEXT: v_min_u32_e32 v15, v19, v5
-; GFX10-NEXT: v_min_u32_e32 v19, v23, v6
-; GFX10-NEXT: v_min_u32_e32 v6, v10, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v11
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v19
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v6
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ret <4 x i32> %result
@@ -1444,63 +1133,47 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 ;
 ; GFX8-LABEL: s_uaddsat_v4i32:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s8, s0
-; GFX8-NEXT: s_cmp_lt_u32 s8, s4
-; GFX8-NEXT: s_cselect_b32 s4, s8, s4
-; GFX8-NEXT: s_add_i32 s0, s0, s4
-; GFX8-NEXT: s_not_b32 s4, s1
-; GFX8-NEXT: s_cmp_lt_u32 s4, s5
-; GFX8-NEXT: s_cselect_b32 s4, s4, s5
-; GFX8-NEXT: s_add_i32 s1, s1, s4
-; GFX8-NEXT: s_not_b32 s4, s2
-; GFX8-NEXT: s_cmp_lt_u32 s4, s6
-; GFX8-NEXT: s_cselect_b32 s4, s4, s6
-; GFX8-NEXT: s_add_i32 s2, s2, s4
-; GFX8-NEXT: s_not_b32 s4, s3
-; GFX8-NEXT: s_cmp_lt_u32 s4, s7
-; GFX8-NEXT: s_cselect_b32 s4, s4, s7
-; GFX8-NEXT: s_add_i32 s3, s3, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_add_u32_e64 v0, s[8:9], s0, v0 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v4i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s8, s0
-; GFX9-NEXT: s_cmp_lt_u32 s8, s4
-; GFX9-NEXT: s_cselect_b32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s0, s0, s4
-; GFX9-NEXT: s_not_b32 s4, s1
-; GFX9-NEXT: s_cmp_lt_u32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_add_i32 s1, s1, s4
-; GFX9-NEXT: s_not_b32 s4, s2
-; GFX9-NEXT: s_cmp_lt_u32 s4, s6
-; GFX9-NEXT: s_cselect_b32 s4, s4, s6
-; GFX9-NEXT: s_add_i32 s2, s2, s4
-; GFX9-NEXT: s_not_b32 s4, s3
-; GFX9-NEXT: s_cmp_lt_u32 s4, s7
-; GFX9-NEXT: s_cselect_b32 s4, s4, s7
-; GFX9-NEXT: s_add_i32 s3, s3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v4i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_not_b32 s8, s0
+; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s4 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s5 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s6 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s7 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s8, s4
-; GFX10-NEXT: s_cselect_b32 s4, s8, s4
-; GFX10-NEXT: s_not_b32 s8, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s4
-; GFX10-NEXT: s_cmp_lt_u32 s8, s5
-; GFX10-NEXT: s_cselect_b32 s4, s8, s5
-; GFX10-NEXT: s_not_b32 s5, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s4
-; GFX10-NEXT: s_cmp_lt_u32 s5, s6
-; GFX10-NEXT: s_cselect_b32 s4, s5, s6
-; GFX10-NEXT: s_not_b32 s5, s3
-; GFX10-NEXT: s_add_i32 s2, s2, s4
-; GFX10-NEXT: s_cmp_lt_u32 s5, s7
-; GFX10-NEXT: s_cselect_b32 s4, s5, s7
-; GFX10-NEXT: s_add_i32 s3, s3, s4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ret <4 x i32> %result
@@ -1530,62 +1203,32 @@ define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v5i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v10, -1, v0
-; GFX8-NEXT: v_min_u32_e32 v5, v10, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v1
-; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v2
-; GFX8-NEXT: v_min_u32_e32 v5, v5, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v3
-; GFX8-NEXT: v_min_u32_e32 v5, v5, v8
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8-NEXT: v_xor_b32_e32 v5, -1, v4
-; GFX8-NEXT: v_min_u32_e32 v5, v5, v9
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v5 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v6 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v7 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v8 clamp
+; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v9 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v5i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v10, -1, v0
-; GFX9-NEXT: v_min_u32_e32 v5, v10, v5
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v1
-; GFX9-NEXT: v_min_u32_e32 v5, v5, v6
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v2
-; GFX9-NEXT: v_min_u32_e32 v5, v5, v7
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v3
-; GFX9-NEXT: v_min_u32_e32 v5, v5, v8
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT: v_xor_b32_e32 v5, -1, v4
-; GFX9-NEXT: v_min_u32_e32 v5, v5, v9
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v5
+; GFX9-NEXT: v_add_u32_e64 v0, v0, v5 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, v1, v6 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, v2, v7 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, v3, v8 clamp
+; GFX9-NEXT: v_add_u32_e64 v4, v4, v9 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v5i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v10, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v11, -1, v1
-; GFX10-NEXT: v_xor_b32_e32 v12, -1, v2
-; GFX10-NEXT: v_xor_b32_e32 v13, -1, v3
-; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4
-; GFX10-NEXT: v_min_u32_e32 v5, v10, v5
-; GFX10-NEXT: v_min_u32_e32 v6, v11, v6
-; GFX10-NEXT: v_min_u32_e32 v7, v12, v7
-; GFX10-NEXT: v_min_u32_e32 v8, v13, v8
-; GFX10-NEXT: v_min_u32_e32 v9, v14, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v6
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v9
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v5 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v6 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v7 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v8 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v9 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
@@ -1619,75 +1262,55 @@ define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 ;
 ; GFX8-LABEL: s_uaddsat_v5i32:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s10, s0
-; GFX8-NEXT: s_cmp_lt_u32 s10, s5
-; GFX8-NEXT: s_cselect_b32 s5, s10, s5
-; GFX8-NEXT: s_add_i32 s0, s0, s5
-; GFX8-NEXT: s_not_b32 s5, s1
-; GFX8-NEXT: s_cmp_lt_u32 s5, s6
-; GFX8-NEXT: s_cselect_b32 s5, s5, s6
-; GFX8-NEXT: s_add_i32 s1, s1, s5
-; GFX8-NEXT: s_not_b32 s5, s2
-; GFX8-NEXT: s_cmp_lt_u32 s5, s7
-; GFX8-NEXT: s_cselect_b32 s5, s5, s7
-; GFX8-NEXT: s_add_i32 s2, s2, s5
-; GFX8-NEXT: s_not_b32 s5, s3
-; GFX8-NEXT: s_cmp_lt_u32 s5, s8
-; GFX8-NEXT: s_cselect_b32 s5, s5, s8
-; GFX8-NEXT: s_add_i32 s3, s3, s5
-; GFX8-NEXT: s_not_b32 s5, s4
-; GFX8-NEXT: s_cmp_lt_u32 s5, s9
-; GFX8-NEXT: s_cselect_b32 s5, s5, s9
-; GFX8-NEXT: s_add_i32 s4, s4, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s8
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: v_add_u32_e64 v0, s[10:11], s0, v0 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp
+; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], s4, v4 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v5i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s10, s0
-; GFX9-NEXT: s_cmp_lt_u32 s10, s5
-; GFX9-NEXT: s_cselect_b32 s5, s10, s5
-; GFX9-NEXT: s_add_i32 s0, s0, s5
-; GFX9-NEXT: s_not_b32 s5, s1
-; GFX9-NEXT: s_cmp_lt_u32 s5, s6
-; GFX9-NEXT: s_cselect_b32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s1, s1, s5
-; GFX9-NEXT: s_not_b32 s5, s2
-; GFX9-NEXT: s_cmp_lt_u32 s5, s7
-; GFX9-NEXT: s_cselect_b32 s5, s5, s7
-; GFX9-NEXT: s_add_i32 s2, s2, s5
-; GFX9-NEXT: s_not_b32 s5, s3
-; GFX9-NEXT: s_cmp_lt_u32 s5, s8
-; GFX9-NEXT: s_cselect_b32 s5, s5, s8
-; GFX9-NEXT: s_add_i32 s3, s3, s5
-; GFX9-NEXT: s_not_b32 s5, s4
-; GFX9-NEXT: s_cmp_lt_u32 s5, s9
-; GFX9-NEXT: s_cselect_b32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
+; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v5i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_not_b32 s10, s0
+; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s10, s5
-; GFX10-NEXT: s_cselect_b32 s5, s10, s5
-; GFX10-NEXT: s_not_b32 s10, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s5
-; GFX10-NEXT: s_cmp_lt_u32 s10, s6
-; GFX10-NEXT: s_cselect_b32 s5, s10, s6
-; GFX10-NEXT: s_not_b32 s6, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s5
-; GFX10-NEXT: s_cmp_lt_u32 s6, s7
-; GFX10-NEXT: s_cselect_b32 s5, s6, s7
-; GFX10-NEXT: s_not_b32 s6, s3
-; GFX10-NEXT: s_add_i32 s2, s2, s5
-; GFX10-NEXT: s_cmp_lt_u32 s6, s8
-; GFX10-NEXT: s_cselect_b32 s5, s6, s8
-; GFX10-NEXT: s_not_b32 s6, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s5
-; GFX10-NEXT: s_cmp_lt_u32 s6, s9
-; GFX10-NEXT: s_cselect_b32 s5, s6, s9
-; GFX10-NEXT: s_add_i32 s4, s4, s5
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
 ret <5 x i32> %result
@@ -1750,162 +1373,66 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v16i32:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v32, -1, v0
-; GFX8-NEXT: v_min_u32_e32 v16, v32, v16
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v1
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v17
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v2
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v18
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v3
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v19
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v4
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v20
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v5
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v21
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v6
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v22
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v7
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v23
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v8
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v24
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v9
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v25
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v10
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v26
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v11
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v27
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v12
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v28
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v13
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v29
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v14
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v30
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v16
-; GFX8-NEXT: v_xor_b32_e32 v16, -1, v15
-; GFX8-NEXT: v_min_u32_e32 v16, v16, v31
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16
+; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp
+; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp
+; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp
+; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp
+; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp
+; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp
+; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp
+; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v10, v26 clamp
+; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp
+; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp
+; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp
+; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp
+; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v16i32:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v32, -1, v0
-; GFX9-NEXT: v_min_u32_e32 v16, v32, v16
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v1
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v17
-; GFX9-NEXT: v_add_u32_e32 v1, v1, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v2
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v18
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v3
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v19
-; GFX9-NEXT: v_add_u32_e32 v3, v3, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v4
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v20
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v5
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v21
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v6
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v22
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v7
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v23
-; GFX9-NEXT: v_add_u32_e32 v7, v7, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v8
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v24
-; GFX9-NEXT: v_add_u32_e32 v8, v8, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v9
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v25
-; GFX9-NEXT: v_add_u32_e32 v9, v9, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v10
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v26
-; GFX9-NEXT: v_add_u32_e32 v10, v10, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v11
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v27
-; GFX9-NEXT: v_add_u32_e32 v11, v11, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v12
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v28
-; GFX9-NEXT: v_add_u32_e32 v12, v12, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v13
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v29
-; GFX9-NEXT: v_add_u32_e32 v13, v13, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v14
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v30
-; GFX9-NEXT: v_add_u32_e32 v14, v14, v16
-; GFX9-NEXT: v_xor_b32_e32 v16, -1, v15
-; GFX9-NEXT: v_min_u32_e32 v16, v16, v31
-; GFX9-NEXT: v_add_u32_e32 v15, v15, v16
+; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp
+; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp
+; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp
+; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp
+; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp
+; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp
+; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp
+; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp
+; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp
+; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp
+; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp
+; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp
+; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v16i32:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v35, -1, v0
-; GFX10-NEXT: v_xor_b32_e32 v32, -1, v2
-; GFX10-NEXT: v_xor_b32_e32 v33, -1, v3
-; GFX10-NEXT: v_xor_b32_e32 v34, -1, v4
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u32_e32 v35, v35, v16
-; GFX10-NEXT: v_xor_b32_e32 v16, -1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v35
-; GFX10-NEXT: v_xor_b32_e32 v35, -1, v5
-; GFX10-NEXT: v_min_u32_e32 v16, v16, v17
-; GFX10-NEXT: v_min_u32_e32 v17, v32, v18
-; GFX10-NEXT: v_min_u32_e32 v18, v33, v19
-; GFX10-NEXT: v_min_u32_e32 v19, v34, v20
-; GFX10-NEXT: v_min_u32_e32 v20, v35, v21
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v16
-; GFX10-NEXT: v_xor_b32_e32 v16, -1, v6
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v17
-; GFX10-NEXT: v_xor_b32_e32 v17, -1, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v18
-; GFX10-NEXT: v_xor_b32_e32 v18, -1, v8
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v19
-; GFX10-NEXT: v_xor_b32_e32 v19, -1, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v20
-; GFX10-NEXT: v_xor_b32_e32 v20, -1, v10
-; GFX10-NEXT: v_min_u32_e32 v16, v16, v22
-; GFX10-NEXT: v_min_u32_e32 v17, v17, v23
-; GFX10-NEXT: v_min_u32_e32 v18, v18, v24
-; GFX10-NEXT: v_min_u32_e32 v19, v19, v25
-; GFX10-NEXT: v_min_u32_e32 v20, v20, v26
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v16
-; GFX10-NEXT: v_xor_b32_e32 v16, -1, v11
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v17
-; GFX10-NEXT: v_xor_b32_e32 v17, -1, v12
-; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v18
-; GFX10-NEXT: v_xor_b32_e32 v18, -1, v13
-; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v19
-; GFX10-NEXT: v_xor_b32_e32 v19, -1, v14
-; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v20
-; GFX10-NEXT: v_xor_b32_e32 v20, -1, v15
-; GFX10-NEXT: v_min_u32_e32 v16, v16, v27
-; GFX10-NEXT: v_min_u32_e32 v17, v17, v28
-; GFX10-NEXT: v_min_u32_e32 v18, v18, v29
-; GFX10-NEXT: v_min_u32_e32 v19, v19, v30
-; GFX10-NEXT: v_min_u32_e32 v20, v20, v31
-; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v16
-; GFX10-NEXT: v_add_nc_u32_e32 v12, v12, v17
-; GFX10-NEXT: v_add_nc_u32_e32 v13, v13, v18
-; GFX10-NEXT: v_add_nc_u32_e32 v14, v14, v19
-; GFX10-NEXT: v_add_nc_u32_e32 v15, v15, v20
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
 ret <16 x i32> %result
@@ -1982,207 +1509,143 @@ define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 ;
 ; GFX8-LABEL: s_uaddsat_v16i32:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_not_b32 s32, s0
-; GFX8-NEXT: s_cmp_lt_u32 s32, s16
-; GFX8-NEXT: s_cselect_b32 s16, s32, s16
-; GFX8-NEXT: s_add_i32 s0, s0, s16
-; GFX8-NEXT: s_not_b32 s16, s1
-; GFX8-NEXT: s_cmp_lt_u32 s16, s17
-; GFX8-NEXT: s_cselect_b32 s16, s16, s17
-; GFX8-NEXT: s_add_i32 s1, s1, s16
-; GFX8-NEXT: s_not_b32 s16, s2
-; GFX8-NEXT: s_cmp_lt_u32 s16, s18
-; GFX8-NEXT: s_cselect_b32 s16, s16, s18
-; GFX8-NEXT: s_add_i32 s2, s2, s16
-; GFX8-NEXT: s_not_b32 s16, s3
-; GFX8-NEXT: s_cmp_lt_u32 s16, s19
-; GFX8-NEXT: s_cselect_b32 s16, s16, s19
-; GFX8-NEXT: s_add_i32 s3, s3, s16
-; GFX8-NEXT: s_not_b32 s16, s4
-; GFX8-NEXT: s_cmp_lt_u32 s16, s20
-; GFX8-NEXT: s_cselect_b32 s16, s16, s20
-; GFX8-NEXT: s_add_i32 s4, s4, s16
-; GFX8-NEXT: s_not_b32 s16, s5
-; GFX8-NEXT: s_cmp_lt_u32 s16, s21
-; GFX8-NEXT: s_cselect_b32 s16, s16, s21
-; GFX8-NEXT: s_add_i32 s5, s5, s16
-; GFX8-NEXT: s_not_b32 s16, s6
-; GFX8-NEXT: s_cmp_lt_u32 s16, s22
-; GFX8-NEXT: s_cselect_b32 s16, s16, s22
-; GFX8-NEXT: s_add_i32 s6, s6, s16
-; GFX8-NEXT: s_not_b32 s16, s7
-; GFX8-NEXT: s_cmp_lt_u32 s16, s23
-; GFX8-NEXT: s_cselect_b32 s16, s16, s23
-; GFX8-NEXT: s_add_i32 s7, s7, s16
-; GFX8-NEXT: s_not_b32 s16, s8
-; GFX8-NEXT: s_cmp_lt_u32 s16, s24
-; GFX8-NEXT: s_cselect_b32 s16, s16, s24
-; GFX8-NEXT: s_add_i32 s8, s8, s16
-; GFX8-NEXT: s_not_b32 s16, s9
-; GFX8-NEXT: s_cmp_lt_u32 s16, s25
-; GFX8-NEXT: s_cselect_b32 s16, s16, s25
-; GFX8-NEXT: s_add_i32 s9, s9, s16
-; GFX8-NEXT: s_not_b32 s16, s10
-; GFX8-NEXT: s_cmp_lt_u32 s16, s26
-; GFX8-NEXT: s_cselect_b32 s16, s16, s26
-; GFX8-NEXT: s_add_i32 s10, s10, s16
-; GFX8-NEXT: s_not_b32 s16, s11
-; GFX8-NEXT: s_cmp_lt_u32 s16, s27
-; GFX8-NEXT: s_cselect_b32 s16, s16, s27
-; GFX8-NEXT: s_add_i32 s11, s11, s16
-; GFX8-NEXT: s_not_b32 s16, s12
-; GFX8-NEXT: s_cmp_lt_u32 s16, s28
-; GFX8-NEXT: s_cselect_b32 s16, s16, s28
-; GFX8-NEXT: s_add_i32 s12, s12, s16
-; GFX8-NEXT: s_not_b32 s16, s13
-; GFX8-NEXT: s_cmp_lt_u32 s16, s29
-; GFX8-NEXT: s_cselect_b32 s16, s16, s29
-; GFX8-NEXT: s_add_i32 s13, s13, s16
-; GFX8-NEXT: s_not_b32 s16, s14
-; GFX8-NEXT: s_cmp_lt_u32 s16, s30
-; GFX8-NEXT: s_cselect_b32 s16, s16, s30
-; GFX8-NEXT: s_add_i32 s14, s14, s16
-; GFX8-NEXT: s_not_b32 s16, s15
-; GFX8-NEXT: s_cmp_lt_u32 s16, s31
-; GFX8-NEXT: s_cselect_b32 s16, s16, s31
-; GFX8-NEXT: s_add_i32 s15, s15, s16
+; GFX8-NEXT: v_mov_b32_e32 v0, s16
+; GFX8-NEXT: v_mov_b32_e32 v1, s17
+; GFX8-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NEXT: v_mov_b32_e32 v3, s19
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
+; GFX8-NEXT: v_mov_b32_e32 v5, s21
+; GFX8-NEXT: v_mov_b32_e32 v6, s22
+; GFX8-NEXT: v_mov_b32_e32 v7, s23
+; GFX8-NEXT: v_mov_b32_e32 v8, s24
+; GFX8-NEXT: v_mov_b32_e32 v9, s25
+; GFX8-NEXT: v_mov_b32_e32 v10, s26
+; GFX8-NEXT: v_mov_b32_e32 v11, s27
+; GFX8-NEXT: v_mov_b32_e32 v12, s28
+; GFX8-NEXT: v_mov_b32_e32 v13, s29
+; GFX8-NEXT: v_mov_b32_e32 v14, s30
+; GFX8-NEXT: v_mov_b32_e32 v15, s31
+; GFX8-NEXT: v_add_u32_e64 v0, s[32:33], s0, v0 clamp
+; GFX8-NEXT: v_add_u32_e64 v1, s[16:17], s1, v1 clamp
+; GFX8-NEXT: v_add_u32_e64 v2, s[16:17], s2, v2 clamp
+; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], s3, v3 clamp
+; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], s4, v4 clamp
+; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], s5, v5 clamp
+; GFX8-NEXT: v_add_u32_e64 v6, s[2:3], s6, v6 clamp
+; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], s7, v7 clamp
+; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], s8, v8 clamp
+; GFX8-NEXT: v_add_u32_e64 v9, s[2:3], s9, v9 clamp
+; GFX8-NEXT: v_add_u32_e64 v10, s[2:3], s10, v10 clamp
+; GFX8-NEXT: v_add_u32_e64 v11, s[2:3], s11, v11 clamp
+; GFX8-NEXT: v_add_u32_e64 v12, s[2:3], s12, v12 clamp
+; GFX8-NEXT: v_add_u32_e64 v13, s[2:3], s13, v13 clamp
+; GFX8-NEXT: v_add_u32_e64 v14, s[2:3], s14, v14 clamp
+; GFX8-NEXT: v_add_u32_e64 v15, s[2:3], s15, v15 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8-NEXT: v_readfirstlane_b32 s6, v6
+; GFX8-NEXT: v_readfirstlane_b32 s7, v7
+; GFX8-NEXT: v_readfirstlane_b32 s8, v8
+; GFX8-NEXT: v_readfirstlane_b32 s9, v9
+; GFX8-NEXT: v_readfirstlane_b32 s10, v10
+; GFX8-NEXT: v_readfirstlane_b32 s11, v11
+; GFX8-NEXT: v_readfirstlane_b32 s12, v12
+; GFX8-NEXT: v_readfirstlane_b32 s13, v13
+; GFX8-NEXT: v_readfirstlane_b32 s14, v14
+; GFX8-NEXT: v_readfirstlane_b32 s15, v15
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v16i32:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_not_b32 s32, s0
-; GFX9-NEXT: s_cmp_lt_u32 s32, s16
-; GFX9-NEXT: s_cselect_b32 s16, s32, s16
-; GFX9-NEXT: s_add_i32 s0, s0, s16
-; GFX9-NEXT: s_not_b32 s16, s1
-; GFX9-NEXT: s_cmp_lt_u32 s16, s17
-; GFX9-NEXT: s_cselect_b32 s16, s16, s17
-; GFX9-NEXT: s_add_i32 s1, s1, s16
-; GFX9-NEXT: s_not_b32 s16, s2
-; GFX9-NEXT: s_cmp_lt_u32 s16, s18
-; GFX9-NEXT: s_cselect_b32 s16, s16, s18
-; GFX9-NEXT: s_add_i32 s2, s2, s16
-; GFX9-NEXT: s_not_b32 s16, s3
-; GFX9-NEXT: s_cmp_lt_u32 s16, s19
-; GFX9-NEXT: s_cselect_b32 s16, s16, s19
-; GFX9-NEXT: s_add_i32 s3, s3, s16
-; GFX9-NEXT: s_not_b32 s16, s4
-; GFX9-NEXT: s_cmp_lt_u32 s16, s20
-; GFX9-NEXT: s_cselect_b32 s16, s16, s20
-; GFX9-NEXT: s_add_i32 s4, s4, s16
-; GFX9-NEXT: s_not_b32 s16, s5
-; GFX9-NEXT: s_cmp_lt_u32 s16, s21
-; GFX9-NEXT: s_cselect_b32 s16, s16, s21
-; GFX9-NEXT: s_add_i32 s5, s5, s16
-; GFX9-NEXT: s_not_b32 s16, s6
-; GFX9-NEXT: s_cmp_lt_u32 s16, s22
-; GFX9-NEXT: s_cselect_b32 s16, s16, s22
-; GFX9-NEXT: s_add_i32 s6, s6, s16
-; GFX9-NEXT: s_not_b32 s16, s7
-; GFX9-NEXT: s_cmp_lt_u32 s16, s23
-; GFX9-NEXT: s_cselect_b32 s16, s16, s23
-; GFX9-NEXT: s_add_i32 s7, s7, s16
-; GFX9-NEXT: s_not_b32 s16, s8
-; GFX9-NEXT: s_cmp_lt_u32 s16, s24
-; GFX9-NEXT: s_cselect_b32 s16, s16, s24
-; GFX9-NEXT: s_add_i32 s8, s8, s16
-; GFX9-NEXT: s_not_b32 s16, s9
-; GFX9-NEXT: s_cmp_lt_u32 s16, s25
-; GFX9-NEXT: s_cselect_b32 s16, s16, s25
-; GFX9-NEXT: s_add_i32 s9, s9, s16
-; GFX9-NEXT: s_not_b32 s16, s10
-; GFX9-NEXT: s_cmp_lt_u32 s16, s26
-; GFX9-NEXT: s_cselect_b32 s16, s16, s26
-; GFX9-NEXT: s_add_i32 s10, s10, s16
-; GFX9-NEXT: s_not_b32 s16, s11
-; GFX9-NEXT: s_cmp_lt_u32 s16, s27
-; GFX9-NEXT: s_cselect_b32 s16, s16, s27
-; GFX9-NEXT: s_add_i32 s11, s11, s16
-; GFX9-NEXT: s_not_b32 s16, s12
-; GFX9-NEXT: s_cmp_lt_u32 s16, s28
-; GFX9-NEXT: s_cselect_b32 s16, s16, s28
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_not_b32 s16, s13
-; GFX9-NEXT: s_cmp_lt_u32 s16, s29
-; GFX9-NEXT: s_cselect_b32 s16, s16, s29
-; GFX9-NEXT: s_add_i32 s13, s13, s16
-; GFX9-NEXT: s_not_b32 s16, s14
-; GFX9-NEXT: s_cmp_lt_u32 s16, s30
-; GFX9-NEXT: s_cselect_b32 s16, s16, s30
-; GFX9-NEXT: s_add_i32 s14, s14, s16
-; GFX9-NEXT: s_not_b32 s16, s15
-; GFX9-NEXT: s_cmp_lt_u32 s16, s31
-; GFX9-NEXT: s_cselect_b32 s16, s16, s31
-; GFX9-NEXT: s_add_i32 s15, s15, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s16
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v6, s22
+; GFX9-NEXT: v_mov_b32_e32 v7, s23
+; GFX9-NEXT: v_mov_b32_e32 v8, s24
+; GFX9-NEXT: v_mov_b32_e32 v9, s25
+; GFX9-NEXT: v_mov_b32_e32 v10, s26
+; GFX9-NEXT: v_mov_b32_e32 v11, s27
+; GFX9-NEXT: v_mov_b32_e32 v12, s28
+; GFX9-NEXT: v_mov_b32_e32 v13, s29
+; GFX9-NEXT: v_mov_b32_e32 v14, s30
+; GFX9-NEXT: v_mov_b32_e32 v15, s31
+; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp
+; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp
+; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp
+; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp
+; GFX9-NEXT: v_add_u32_e64 v5, s5, v5 clamp
+; GFX9-NEXT: v_add_u32_e64 v6, s6, v6 clamp
+; GFX9-NEXT: v_add_u32_e64 v7, s7, v7 clamp
+; GFX9-NEXT: v_add_u32_e64 v8, s8, v8 clamp
+; GFX9-NEXT: v_add_u32_e64 v9, s9, v9 clamp
+; GFX9-NEXT: v_add_u32_e64 v10, s10, v10 clamp
+; GFX9-NEXT: v_add_u32_e64 v11, s11, v11 clamp
+; GFX9-NEXT: v_add_u32_e64 v12, s12, v12 clamp
+; GFX9-NEXT: v_add_u32_e64 v13, s13, v13 clamp
+; GFX9-NEXT: v_add_u32_e64 v14, s14, v14 clamp
+; GFX9-NEXT: v_add_u32_e64 v15, s15, v15 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s4, v4
+; GFX9-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9-NEXT: v_readfirstlane_b32 s6, v6
+; GFX9-NEXT: v_readfirstlane_b32 s7, v7
+; GFX9-NEXT: v_readfirstlane_b32 s8, v8
+; GFX9-NEXT: v_readfirstlane_b32 s9, v9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v10
+; GFX9-NEXT: v_readfirstlane_b32 s11, v11
+; GFX9-NEXT: v_readfirstlane_b32 s12, v12
+; GFX9-NEXT: v_readfirstlane_b32 s13, v13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v14
+; GFX9-NEXT: v_readfirstlane_b32 s15, v15
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v16i32:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_not_b32 s46, s0
+; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp
+; GFX10-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_readfirstlane_b32 s8, v8
+; GFX10-NEXT: v_readfirstlane_b32 s9, v9
+; GFX10-NEXT: v_readfirstlane_b32 s10, v10
+; GFX10-NEXT: v_readfirstlane_b32 s11, v11
+; GFX10-NEXT: v_readfirstlane_b32 s12, v12
+; GFX10-NEXT: v_readfirstlane_b32 s13, v13
+; GFX10-NEXT: v_readfirstlane_b32 s14, v14
+; GFX10-NEXT: v_readfirstlane_b32 s15, v15
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s46, s16
-; GFX10-NEXT: s_cselect_b32 s46, s46, s16
-; GFX10-NEXT: s_not_b32 s47, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s46
-; GFX10-NEXT: s_cmp_lt_u32 s47, s17
-; GFX10-NEXT: s_cselect_b32 s46, s47, s17
-; GFX10-NEXT: s_not_b32 s17, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s46
-; GFX10-NEXT: s_cmp_lt_u32 s17, s18
-; GFX10-NEXT: s_cselect_b32 s16, s17, s18
-; GFX10-NEXT: s_not_b32 s17, s3
-; GFX10-NEXT: s_add_i32 s2, s2, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s19
-; GFX10-NEXT: s_cselect_b32 s16, s17, s19
-; GFX10-NEXT: s_not_b32 s17, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s20
-; GFX10-NEXT: s_cselect_b32 s16, s17, s20
-; GFX10-NEXT: s_not_b32 s17, s5
-; GFX10-NEXT: s_add_i32 s4, s4, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s21
-; GFX10-NEXT: s_cselect_b32 s16, s17, s21
-; GFX10-NEXT: s_not_b32 s17, s6
-; GFX10-NEXT: s_add_i32 s5, s5, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s22
-; GFX10-NEXT: s_cselect_b32 s16, s17, s22
-; GFX10-NEXT: s_not_b32 s17, s7
-; GFX10-NEXT: s_add_i32 s6, s6, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s23
-; GFX10-NEXT: s_cselect_b32 s16, s17, s23
-; GFX10-NEXT: s_not_b32 s17, s8
-; GFX10-NEXT: s_add_i32 s7, s7, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s24
-; GFX10-NEXT: s_cselect_b32 s16, s17, s24
-; GFX10-NEXT: s_not_b32 s17, s9
-; GFX10-NEXT: s_add_i32 s8, s8, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s25
-; GFX10-NEXT: s_cselect_b32 s16, s17, s25
-; GFX10-NEXT: s_not_b32 s17, s10
-; GFX10-NEXT: s_add_i32 s9, s9, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s26
-; GFX10-NEXT: s_cselect_b32 s16, s17, s26
-; GFX10-NEXT: s_not_b32 s17, s11
-; GFX10-NEXT: s_add_i32 s10, s10, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s27
-; GFX10-NEXT: s_cselect_b32 s16, s17, s27
-; GFX10-NEXT: s_not_b32 s17, s12
-; GFX10-NEXT: s_add_i32 s11, s11, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s28
-; GFX10-NEXT: s_cselect_b32 s16, s17, s28
-; GFX10-NEXT: s_not_b32 s17, s13
-; GFX10-NEXT: s_add_i32 s12, s12, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s29
-; GFX10-NEXT: s_cselect_b32 s16, s17, s29
-; GFX10-NEXT: s_not_b32 s17, s14
-; GFX10-NEXT: s_add_i32 s13, s13, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s30
-; GFX10-NEXT: s_cselect_b32 s16, s17, s30
-; GFX10-NEXT: s_not_b32 s17, s15
-; GFX10-NEXT: s_add_i32 s14, s14, s16
-; GFX10-NEXT: s_cmp_lt_u32 s17, s31
-; GFX10-NEXT: s_cselect_b32 s16, s17, s31
-; GFX10-NEXT: s_add_i32 s15, s15, s16
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
 ret <16 x i32> %result
@@ -2203,27 +1666,21 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-LABEL: v_uaddsat_i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0
-; GFX8-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0
-; GFX9-NEXT: v_min_u16_e32 v1, v2, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff, v0
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u16_e64 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
 ret i16 %result
@@ -2243,33 +1700,23 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
 ;
 ; GFX8-LABEL: s_uaddsat_i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_xor_b32 s2, s0, 0xffff
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s2, s1
-; GFX8-NEXT: s_cselect_b32 s1, s2, s1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_xor_b32 s2, s0, 0xffff
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX9-NEXT: s_cmp_lt_u32 s2, s1
-; GFX9-NEXT: s_cselect_b32 s1, s2, s1
-; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_xor_b32 s2, s0, 0xffff
-; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cmp_lt_u32 s2, s1
-; GFX10-NEXT: s_cselect_b32 s1, s2, s1
-; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
 ret i16 %result
@@ -2288,24 +1735,18 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_i16_sv:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_xor_b32 s1, s0, 0xffff
-; GFX8-NEXT: v_min_u16_e32 v0, s1, v0
-; GFX8-NEXT: v_add_u16_e32 v0, s0, v0
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_i16_sv:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_xor_b32 s1, s0, 0xffff
-; GFX9-NEXT: v_min_u16_e32 v0, s1, v0
-; GFX9-NEXT: v_add_u16_e32 v0, s0, v0
+; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_i16_sv:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_xor_b32 s1, s0, 0xffff
+; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u16_e64 v0, s1, v0
-; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
 %cast = bitcast i16 %result to half
@@ -2325,24 +1766,18 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_i16_vs:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff, v0
-; GFX8-NEXT: v_min_u16_e32 v1, s0, v1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e64 v0, v0, s0 clamp
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_i16_vs:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff, v0
-; GFX9-NEXT: v_min_u16_e32 v1, s0, v1
-; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_i16_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_xor_b32_e32 v1, 0xffff, v0
+; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, s0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_min_u16_e64 v1, v1, s0
-; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
 %cast = bitcast i16 %result to half
@@ -2370,35 +1805,25 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v2i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_xor_b32_e32 v3, s4, v0
-; GFX8-NEXT: v_xor_b32_e32 v4, s4, v2
-; GFX8-NEXT: v_min_u16_e32 v3, v3, v1
-; GFX8-NEXT: v_min_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v3
-; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT: v_xor_b32_e32 v2, s4, v0
-; GFX9-NEXT: v_pk_min_u16 v1, v2, v1
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v2, s4, v0
-; GFX10-NEXT: v_pk_min_u16 v1, v2, v1
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 ret <2 x i16> %result
@@ -2430,68 +1855,30 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ;
 ; GFX8-LABEL: s_uaddsat_v2i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: s_xor_b32 s5, s0, s4
 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s5, s1
-; GFX8-NEXT: s_cselect_b32 s1, s5, s1
-; GFX8-NEXT: s_add_i32 s0, s0, s1
-; GFX8-NEXT: s_xor_b32 s1, s2, s4
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s1, s3
-; GFX8-NEXT: s_cselect_b32 s1, s1, s3
-; GFX8-NEXT: s_add_i32 s2, s2, s1
-; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v2i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, -1, -1
-; GFX9-NEXT: s_xor_b32 s2, s0, s2
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NEXT: s_lshr_b32 s3, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s1, 16
-; GFX9-NEXT: s_and_b32 s2, s2, s4
-; GFX9-NEXT: s_and_b32 s1, s1, s4
-; GFX9-NEXT: s_cmp_lt_u32 s2, s1
-; GFX9-NEXT: s_cselect_b32 s1, s2, s1
-; GFX9-NEXT: s_cmp_lt_u32 s3, s5
-; GFX9-NEXT: s_cselect_b32 s2, s3, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v2i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, -1, -1
-; GFX10-NEXT: s_mov_b32 s3, 0xffff
-; GFX10-NEXT: s_xor_b32 s2, s0, s2
+; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s2, s3
-; GFX10-NEXT: s_and_b32 s3, s1, s3
-; GFX10-NEXT: s_lshr_b32 s1, s1, 16
-; GFX10-NEXT: s_cmp_lt_u32 s2, s3
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
-; GFX10-NEXT: s_cmp_lt_u32 s4, s1
-; GFX10-NEXT: s_cselect_b32 s1, s4, s1
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1
-; GFX10-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s1
-; GFX10-NEXT: s_add_i32 s2, s2, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to i32
@@ -2522,34 +1909,24 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_v2i16_sv:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_xor_b32 s3, s0, s2
-; GFX8-NEXT: s_xor_b32 s2, s1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_min_u16_e32 v1, s3, v0
-; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u16_e32 v1, s0, v1
-; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp
+; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_v2i16_sv:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1
-; GFX9-NEXT: s_xor_b32 s1, s0, s1
-; GFX9-NEXT: v_pk_min_u16 v0, s1, v0
-; GFX9-NEXT: v_pk_add_u16 v0, s0, v0
+; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_v2i16_sv:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1
+; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_xor_b32 s1, s0, s1
-; GFX10-NEXT: v_pk_min_u16 v0, s1, v0
-; GFX10-NEXT: v_pk_add_u16 v0, s0, v0
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to float
@@ -2580,33 +1957,24 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_v2i16_vs:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT: v_xor_b32_e32 v2, s2, v0
 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v1
-; GFX8-NEXT: v_min_u16_e32 v2, s0, v2
-; GFX8-NEXT: v_min_u16_e32 v3, s1, v3
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
-; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_v2i16_vs:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1
-; GFX9-NEXT: v_xor_b32_e32 v1, s1, v0
-; GFX9-NEXT: v_pk_min_u16 v1, v1, s0
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 clamp
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: uaddsat_v2i16_vs:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1
+; GFX10-NEXT: v_pk_add_u16 v0, v0, s0 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v1, s1, v0
-; GFX10-NEXT: v_pk_min_u16 v1, v1, s0
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
 %cast = bitcast <2 x i16> %result to float
@@ -2666,49 +2034,31 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v4i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_xor_b32_e32 v6, s4, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: v_xor_b32_e32 v7, s4, v4
-; GFX8-NEXT: v_min_u16_e32 v6, v6, v2
-; GFX8-NEXT: v_min_u16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v7, s4, v1
-; GFX8-NEXT: v_xor_b32_e32 v8, s4, v5
-; GFX8-NEXT: v_min_u16_e32 v7, v7, v3
-; GFX8-NEXT: v_min_u16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v6
-; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_add_u16_e32 v1, v1, v7
-; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0
-; GFX9-NEXT: v_pk_min_u16 v2, v4, v2
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
-; GFX9-NEXT: v_xor_b32_e32 v2, s4, v1
-; GFX9-NEXT: v_pk_min_u16 v2, v2, v3
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v4i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp
+; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v4, s4, v0
-; GFX10-NEXT: v_xor_b32_e32 v5, s4, v1
-; GFX10-NEXT: v_pk_min_u16 v2, v4, v2
-; GFX10-NEXT: v_pk_min_u16 v3, v5, v3
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v2
-; GFX10-NEXT: v_pk_add_u16 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x float>
@@ -2759,116 +2109,44 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ;
 ; GFX8-LABEL: s_uaddsat_v4i16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_mov_b32 s8, 0xffff
-; GFX8-NEXT: s_xor_b32 s9, s0, s8
 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s9, s2
-; GFX8-NEXT: s_cselect_b32 s2, s9, s2
-; GFX8-NEXT: s_add_i32 s0, s0, s2
-; GFX8-NEXT: s_xor_b32 s2, s4, s8
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s2, s6
-; GFX8-NEXT: s_add_i32 s4, s4, s2
-; GFX8-NEXT: s_xor_b32 s2, s1, s8
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s2, s3
-; GFX8-NEXT: s_cselect_b32 s2, s2, s3
-; GFX8-NEXT: s_add_i32 s1, s1, s2
-; GFX8-NEXT: s_xor_b32 s2, s5, s8
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
-; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000
-; GFX8-NEXT: s_cmp_lt_u32 s2, s3
-; GFX8-NEXT: s_cselect_b32 s2, s2, s3
-; GFX8-NEXT: s_add_i32 s5, s5, s2
-; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000
-; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp
+; GFX8-NEXT: v_mov_b32_e32 v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v4i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT: s_xor_b32 s5, s0, s4
-; GFX9-NEXT: s_mov_b32 s7, 0xffff
-; GFX9-NEXT: s_lshr_b32 s6, s5, 16
-; GFX9-NEXT: s_lshr_b32 s8, s2, 16
-; GFX9-NEXT: s_and_b32 s5, s5, s7
-; GFX9-NEXT: s_and_b32 s2, s2, s7
-; GFX9-NEXT: s_cmp_lt_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_cmp_lt_u32 s6, s8
-; GFX9-NEXT: s_cselect_b32 s5, s6, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX9-NEXT: s_lshr_b32 s5, s0, 16
-; GFX9-NEXT: s_lshr_b32 s6, s2, 16
-; GFX9-NEXT: s_add_i32 s0, s0, s2
-; GFX9-NEXT: s_xor_b32 s2, s1, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NEXT: s_and_b32 s2, s2, s7
-; GFX9-NEXT: s_and_b32 s3, s3, s7
-; GFX9-NEXT: s_cmp_lt_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
-; GFX9-NEXT: s_cmp_lt_u32 s4, s5
-; GFX9-NEXT: s_cselect_b32 s3, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_add_i32 s1, s1, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
+; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_uaddsat_v4i16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
-; GFX10-NEXT: s_mov_b32 s6, 0xffff
-; GFX10-NEXT: s_xor_b32 s5, s0, s4
-; GFX10-NEXT: s_and_b32 s8, s2, s6
-; GFX10-NEXT: s_lshr_b32 s7, s5, 16
-; GFX10-NEXT: s_and_b32 s5, s5, s6
-; GFX10-NEXT: s_lshr_b32 s2, s2, 16
-; GFX10-NEXT: s_cmp_lt_u32 s5, s8
+; GFX10-NEXT: v_pk_add_u16 v0, s0, s2 clamp
+; GFX10-NEXT: v_pk_add_u16 v1, s1, s3 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: s_cselect_b32 s5, s5, s8
-; GFX10-NEXT: s_cmp_lt_u32 s7, s2
-; GFX10-NEXT: s_cselect_b32 s2, s7, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s2
-; GFX10-NEXT: s_lshr_b32 s5, s0, 16
-; GFX10-NEXT: s_lshr_b32 s7, s2, 16
-; GFX10-NEXT: s_add_i32 s0, s0, s2
-; GFX10-NEXT: s_xor_b32 s2, s1, s4
-; GFX10-NEXT: s_add_i32 s5, s5, s7
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s2, s6
-; GFX10-NEXT: s_and_b32 s6, s3, s6
-; GFX10-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-NEXT: s_cmp_lt_u32 s2, s6
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5
-; GFX10-NEXT: s_cselect_b32 s2, s2, s6
-; GFX10-NEXT: s_cmp_lt_u32 s4, s3
-; GFX10-NEXT: s_cselect_b32 s3, s4, s3
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_lshr_b32 s4, s2, 16
-; GFX10-NEXT: s_add_i32 s1, s1, s2
-; GFX10-NEXT: s_add_i32 s3, s3, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT: ; return to shader part epilog
 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 %cast = bitcast <4 x i16> %result to <2 x i32>
@@ -2944,63 +2222,38 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v6i16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_xor_b32_e32 v9, s4, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT: v_xor_b32_e32 v10, s4, v6
-; GFX8-NEXT: v_min_u16_e32 v9, v9, v3
-; GFX8-NEXT: v_min_u16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v10, s4, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT: v_xor_b32_e32 v11, s4, v7
-; GFX8-NEXT: v_min_u16_e32 v10, v10, v4
-; GFX8-NEXT: v_min_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_xor_b32_e32 v11, s4, v2
-; GFX8-NEXT: v_xor_b32_e32 v12, s4, v8
-; GFX8-NEXT: v_add_u16_e32 v0, v0, v9
-; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_u16_e32 v11, v11, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX8-NEXT: v_min_u16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_add_u16_e32 v1, v1, v10
-; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX8-NEXT: v_add_u16_e32 v2, v2, v11
-; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp
+; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v6i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
-; GFX9-NEXT: v_xor_b32_e32 v6, s4, v0
-; GFX9-NEXT: v_pk_min_u16 v3, v6, v3
-; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, s4, v1
-; GFX9-NEXT: v_pk_min_u16 v3, v3, v4
-; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, s4, v2
-; GFX9-NEXT: v_pk_min_u16 v3, v3, v5
-; GFX9-NEXT: v_pk_add_u16 v2, v2, v3
+; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 clamp
+; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 clamp
+; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 clamp
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v6i16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
+; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 clamp
+; GFX10-NEXT: v_pk_add_u16 v1, v1, v4 clamp
+; GFX10-NEXT: v_pk_add_u16 v2, v2, v5 clamp
 ; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v6, s4, v0
-; GFX10-NEXT: v_xor_b32_e32 v7, s4, v1
-; GFX10-NEXT: v_xor_b32_e32 v8, s4, v2
-; GFX10-NEXT: v_pk_min_u16 v3, v6, v3
-; GFX10-NEXT: v_pk_min_u16 v4, v7, v4
-; GFX10-NEXT: v_pk_min_u16 v5, v8, v5
-; GFX10-NEXT: v_pk_add_u16 v0, v0, v3
-; GFX10-NEXT: v_pk_add_u16 v1, v1, v4
-; GFX10-NEXT: v_pk_add_u16 v2, v2, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
 %cast = bitcast <6 x i16> %result to <3 x float>
@@ -3069,164 +2322,58 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6
x i16> inreg %lhs, <6 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff -; GFX8-NEXT: s_xor_b32 s13, s0, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s13, s13, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s13, s3 -; GFX8-NEXT: s_cselect_b32 s3, s13, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s3, s6, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s3, s9 -; GFX8-NEXT: s_add_i32 s6, s6, s3 -; GFX8-NEXT: s_xor_b32 s3, s1, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s1, s1, s3 -; GFX8-NEXT: s_xor_b32 s3, s7, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s7, s7, s3 -; GFX8-NEXT: s_xor_b32 s3, s2, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_xor_b32 s3, s8, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s8, s8, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part 
epilog ; ; GFX9-LABEL: s_uaddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_cmp_lt_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_cmp_lt_u32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s7, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s3, s1, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_u32 s7, s8 -; GFX9-NEXT: s_cselect_b32 s4, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_xor_b32 s3, s2, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_xor_b32 s7, s0, s6 -; GFX10-NEXT: s_and_b32 s10, s3, s8 -; GFX10-NEXT: s_lshr_b32 s9, s7, 16 -; GFX10-NEXT: s_and_b32 s7, s7, s8 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s7, s10 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_add_u16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_add_u16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s7, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s3 -; GFX10-NEXT: s_cselect_b32 s3, s9, s3 -; GFX10-NEXT: s_and_b32 s10, s4, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_xor_b32 s3, s1, s6 -; GFX10-NEXT: s_add_i32 s7, s7, s9 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_add_i32 s1, s1, 
s3 -; GFX10-NEXT: s_xor_b32 s3, s2, s6 -; GFX10-NEXT: s_add_i32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_and_b32 s8, s5, s8 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s8 -; GFX10-NEXT: s_cmp_lt_u32 s6, s5 -; GFX10-NEXT: s_cselect_b32 s5, s6, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -3307,77 +2454,44 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_uaddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v12, s4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v8 -; GFX8-NEXT: v_min_u16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v9 -; GFX8-NEXT: v_min_u16_e32 v13, v13, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v10 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v14, v14, v6 -; GFX8-NEXT: v_min_u16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v16, s4, v11 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v15, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: 
v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 -; GFX9-NEXT: v_pk_min_u16 v4, v8, v4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v1 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v5 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v2 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v6 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v3 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v7 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_pk_add_u16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v1, v5 clamp +; GFX10-NEXT: v_pk_add_u16 v2, v2, v6 clamp +; GFX10-NEXT: v_pk_add_u16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v15, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v19, s4, v1 -; GFX10-NEXT: v_xor_b32_e32 v23, s4, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, s4, v3 -; GFX10-NEXT: v_pk_min_u16 v11, v15, v4 -; GFX10-NEXT: v_pk_min_u16 v15, v19, v5 -; GFX10-NEXT: v_pk_min_u16 v19, v23, v6 -; GFX10-NEXT: v_pk_min_u16 v6, v10, v7 -; GFX10-NEXT: v_pk_add_u16 v0, v0, v11 -; GFX10-NEXT: v_pk_add_u16 v1, v1, v15 -; GFX10-NEXT: v_pk_add_u16 v2, v2, v19 -; GFX10-NEXT: v_pk_add_u16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -3464,212 +2578,72 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s16, 0xffff -; GFX8-NEXT: s_xor_b32 s17, s0, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp +; 
GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s17, s17, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s17, s4 -; GFX8-NEXT: s_cselect_b32 s4, s17, s4 -; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_xor_b32 s4, s8, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_add_i32 s8, s8, s4 -; GFX8-NEXT: s_xor_b32 s4, s1, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s1, s1, s4 -; GFX8-NEXT: s_xor_b32 s4, s9, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s9, s9, s4 -; GFX8-NEXT: s_xor_b32 s4, s2, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s2, s2, s4 -; GFX8-NEXT: s_xor_b32 s4, s10, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s10, s10, s4 -; GFX8-NEXT: s_xor_b32 s4, s3, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_xor_b32 s4, s11, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s11, s11, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_mov_b32 s11, 0xffff -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_and_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_cmp_lt_u32 s9, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cmp_lt_u32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s9, s10, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s9, s9, s10 -; GFX9-NEXT: s_xor_b32 s4, s1, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s5, s5, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s5, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s4, s2, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s9, s6, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s6, s6, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_u32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s7, s7, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_uaddsat_v8i16: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_xor_b32 s9, s0, s8 -; GFX10-NEXT: s_and_b32 s12, s4, s10 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 -; GFX10-NEXT: s_and_b32 s9, s9, s10 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s9, s12 +; GFX10-NEXT: v_pk_add_u16 v0, s0, s4 clamp +; GFX10-NEXT: v_pk_add_u16 v1, s1, s5 clamp +; GFX10-NEXT: v_pk_add_u16 v2, s2, s6 clamp +; GFX10-NEXT: v_pk_add_u16 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s9, s9, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s4 -; GFX10-NEXT: s_cselect_b32 s4, s11, s4 -; GFX10-NEXT: s_and_b32 s12, s5, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_xor_b32 s4, s1, s8 -; GFX10-NEXT: s_add_i32 s9, s9, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s5 -; GFX10-NEXT: s_cselect_b32 s5, s11, s5 -; GFX10-NEXT: s_and_b32 s12, s6, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s4, s2, s8 -; GFX10-NEXT: s_add_i32 s5, s5, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 -; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s6 -; GFX10-NEXT: s_cselect_b32 s6, s11, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_xor_b32 s4, s3, s8 -; GFX10-NEXT: s_add_i32 s6, s6, s11 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 -; GFX10-NEXT: s_and_b32 s10, s7, s10 -; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 -; GFX10-NEXT: s_cmp_lt_u32 s8, s7 -; GFX10-NEXT: s_cselect_b32 s7, s8, s7 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 -; GFX10-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-NEXT: s_add_i32 s3, s3, s4 -; GFX10-NEXT: s_add_i32 s5, s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll index 25eafb45f9309..54eebc9205796 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -311,7 +311,7 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 -; CGP-NEXT: s_mov_b32 s5, 0xfffff000 +; CGP-NEXT: s_movk_i32 s5, 0xf000 ; CGP-NEXT: v_mov_b32_e32 v3, 0xfffff000 ; CGP-NEXT: v_cvt_f32_u32_e32 
v4, s4 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e956af93bc6f1..59b4318012e4c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -185,14 +185,13 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_udiv_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: @@ -324,9 +323,9 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s4, 1 +; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 68a83a91c62f8..f331deea89e54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -286,7 +286,7 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: s_movk_i32 s4, 0x1000 ; CGP-NEXT: v_mov_b32_e32 v2, 0x1000 ; CGP-NEXT: s_mov_b32 s5, 0x4f7ffffe -; CGP-NEXT: s_mov_b32 s6, 0xfffff000 +; CGP-NEXT: s_movk_i32 s6, 0xf000 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, s4 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 6b9357043b3ca..a01ba29cc3276 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -182,14 +182,13 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_urem_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 -; CHECK-NEXT: s_xor_b64 vcc, s[4:5], s[6:7] -; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s[6:7], s[6:7], 0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1 +; CHECK-NEXT: s_xor_b64 vcc, s[6:7], s[8:9] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: @@ -320,9 +319,9 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg 
%num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: BB1_2: ; %Flow -; CHECK-NEXT: s_and_b32 s1, s4, 1 +; CHECK-NEXT: s_and_b32 s1, s5, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc0 BB1_4 ; CHECK-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index b111fd31851cf..8553853ff00c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -20,8 +20,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -30,8 +29,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -42,8 +40,7 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) @@ -66,13 +63,10 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i7: @@ -80,28 +74,21 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s0, 
0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) ret i7 %result @@ -123,8 +110,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -133,8 +119,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -145,8 +130,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) @@ -169,13 +153,10 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i8: @@ -183,28 +164,21 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: 
s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result @@ -241,11 +215,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v3, v2 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -259,11 +231,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v3 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_sub_u16_e64 v1, v2, v3 clamp ; GFX9-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -280,10 +250,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0 ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_min_u16_e64 v3, v0, v3 -; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v2, v1 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v3 clamp ; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -322,92 +290,60 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-LABEL: s_usubsat_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s1 -; GFX8-NEXT: s_cselect_b32 s1, s5, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s2, s3, s4 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s3, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s2 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; 
GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s3, s4 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: s_lshl_b32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s4, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s2, s3, s4 -; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_bfe_u32 s3, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s2 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: s_lshl_b32 s0, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshl_b32 s4, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: s_cmp_lt_u32 s5, s4 +; GFX10-NEXT: s_lshr_b32 s2, s0, 8 +; GFX10-NEXT: s_bfe_u32 s3, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v1, s2, s3 clamp +; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: 
s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_cmp_lt_u32 s4, s1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> @@ -471,18 +407,14 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v3, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_min_u16_e32 v3, v2, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_u16_e64 v2, v2, v3 clamp ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_min_u16_e32 v4, v3, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_u16_e64 v3, v3, v4 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -506,22 +438,18 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v5 -; GFX9-NEXT: v_sub_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: v_sub_u16_e64 v1, v2, v5 clamp ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_min_u16_e32 v3, v2, v3 -; GFX9-NEXT: v_sub_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_min_u16_e32 v4, v3, v4 ; GFX9-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_e64 v2, v2, v3 clamp +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_sub_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_u16_e64 v3, v3, v4 clamp ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v2, v3, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -533,32 +461,28 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; 
GFX10-NEXT: v_lshlrev_b16_e64 v4, 8, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16_e64 v5, 8, v1 -; GFX10-NEXT: s_mov_b32 s4, 16 -; GFX10-NEXT: s_mov_b32 s5, 24 -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v3, v2, v3 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_min_u16_e64 v5, v4, v5 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v3 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_min_u16_e64 v3, v6, v7 -; GFX10-NEXT: v_sub_nc_u16_e64 v4, v4, v5 -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v6, 8, v1 +; GFX10-NEXT: s_mov_b32 s5, 16 +; GFX10-NEXT: s_mov_b32 s4, 24 +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, v3 clamp +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_movk_i32 s5, 0xff +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v5, v5, v6 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v3, v4, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v6, v3 -; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v4 -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_and_b32_sdwa v1, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v0, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v2, v4, s4, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, 8, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v2, v4, s5, v2 ; GFX10-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -617,176 +541,107 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-LABEL: s_usubsat_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x100000 -; 
GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s9, s1 -; GFX8-NEXT: s_cselect_b32 s1, s9, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s2, s8 -; GFX8-NEXT: s_lshl_b32 s2, s5, s8 -; GFX8-NEXT: s_lshr_b32 s0, s0, s8 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s2 -; GFX8-NEXT: s_cselect_b32 s2, s5, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s3, s8 -; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_lshr_b32 s1, s1, s8 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s3 -; GFX8-NEXT: s_cselect_b32 s3, s5, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s4, s8 -; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_lshr_b32 s2, s2, s8 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s4 -; GFX8-NEXT: s_cselect_b32 s4, s5, s4 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_and_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_lshr_b32 s3, s3, s8 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s8, 8, 0x100000 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 ; GFX9-NEXT: s_lshr_b32 s5, s1, 8 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s9, s1 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: 
s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshl_b32 s1, s2, s8 -; GFX9-NEXT: s_lshl_b32 s2, s5, s8 -; GFX9-NEXT: s_lshr_b32 s0, s0, s8 -; GFX9-NEXT: s_bfe_u32 s5, s1, 0x100000 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_lshl_b32 s2, s3, s8 -; GFX9-NEXT: s_lshl_b32 s3, s6, s8 -; GFX9-NEXT: s_lshr_b32 s1, s1, s8 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x100000 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s3 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s4, s8 -; GFX9-NEXT: s_lshl_b32 s4, s7, s8 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_bfe_u32 s5, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s4 -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s2, s8 +; GFX9-NEXT: s_lshl_b32 s1, s6, s8 +; GFX9-NEXT: v_sub_u16_e64 v1, s0, v1 clamp +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s3, s8 +; GFX9-NEXT: s_lshl_b32 s1, s7, s8 +; GFX9-NEXT: v_sub_u16_e64 v2, s0, v2 clamp +; GFX9-NEXT: s_lshl_b32 s0, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_sub_u16_e64 v3, s0, v3 clamp +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s6, 8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 -; GFX10-NEXT: s_lshl_b32 s8, s1, s6 +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, s6, s5 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, s2, s6 clamp ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s9, s0, 0x100000 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_movk_i32 s2, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s7, s1, s5 +; GFX10-NEXT: v_and_b32_sdwa 
v1, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s7 clamp +; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-NEXT: s_cmp_lt_u32 s9, s8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v2, s3, s0 clamp +; GFX10-NEXT: v_sub_nc_u16_e64 v3, s4, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s9, s8 -; GFX10-NEXT: s_lshl_b32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s0, s0, s8 -; GFX10-NEXT: s_lshl_b32 s2, s2, s6 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshr_b32 s0, s0, s6 -; GFX10-NEXT: s_cmp_lt_u32 s8, s5 -; GFX10-NEXT: s_cselect_b32 s5, s8, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s5, s7, s6 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_cmp_lt_u32 s7, s5 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_lshl_b32 s1, s1, s6 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s6 -; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s4, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s3, s3, s6 -; GFX10-NEXT: s_cmp_lt_u32 s5, s1 -; GFX10-NEXT: s_cselect_b32 s1, s5, s1 -; GFX10-NEXT: s_sub_i32 s1, s4, s1 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s4 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -811,8 +666,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -821,8 +675,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -833,8 +686,7 @@ define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { ; 
GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) @@ -854,22 +706,22 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i24: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_cmp_lt_u32 s0, s1 -; GFX8-NEXT: s_cselect_b32 s1, s0, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_cmp_lt_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s1, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i24: @@ -877,10 +729,9 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s0, s1 -; GFX10-NEXT: s_cselect_b32 s1, s0, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) ret i24 %result @@ -897,24 +748,21 @@ define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8-LABEL: v_usubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -930,24 +778,23 @@ define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s1 -; GFX8-NEXT: s_cselect_b32 s1, s0, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader 
part epilog ; ; GFX9-LABEL: s_usubsat_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s1 -; GFX9-NEXT: s_cselect_b32 s1, s0, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s1 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s1, s0, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result @@ -962,21 +809,18 @@ define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; ; GFX8-LABEL: usubsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i32_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -992,21 +836,18 @@ define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; ; GFX8-LABEL: usubsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i32_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u32_e32 v1, s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) %cast = bitcast i32 %result to float @@ -1026,30 +867,24 @@ define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX10-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX10-NEXT: v_min_u32_e32 v3, v1, v3 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1068,33 +903,31 @@ define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX8-LABEL: s_usubsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s2, s0, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s2 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s2, s1, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s2, s0, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_cmp_lt_u32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s2, s1, s3 -; GFX9-NEXT: s_sub_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s2 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s2 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s0, s2 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_cmp_lt_u32 s1, s3 -; GFX10-NEXT: s_cselect_b32 s2, s1, s3 -; GFX10-NEXT: s_sub_i32 s1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result @@ -1115,36 +948,27 @@ define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX10-NEXT: v_min_u32_e32 v4, v1, v4 -; GFX10-NEXT: v_min_u32_e32 v5, v2, v5 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1166,42 +990,39 @@ define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX8-LABEL: s_usubsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s3 -; GFX8-NEXT: s_cselect_b32 s3, s0, s3 -; GFX8-NEXT: s_sub_i32 s0, s0, s3 -; GFX8-NEXT: s_cmp_lt_u32 s1, s4 -; GFX8-NEXT: s_cselect_b32 s3, s1, s4 -; GFX8-NEXT: s_sub_i32 s1, s1, s3 -; GFX8-NEXT: s_cmp_lt_u32 s2, s5 -; GFX8-NEXT: s_cselect_b32 s3, s2, s5 -; GFX8-NEXT: s_sub_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v3i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s3, s0, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_cmp_lt_u32 s1, s4 -; GFX9-NEXT: s_cselect_b32 s3, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_u32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s3, s2, s5 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v3i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s3 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s3 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s3, s0, s3 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_cmp_lt_u32 s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s1, s4 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_cmp_lt_u32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s3, s2, s5 -; GFX10-NEXT: s_sub_i32 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result @@ -1224,42 +1045,30 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX8-NEXT: 
v_sub_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v19, v2, v6 -; GFX10-NEXT: v_min_u32_e32 v11, v0, v4 -; GFX10-NEXT: v_min_u32_e32 v15, v1, v5 -; GFX10-NEXT: v_min_u32_e32 v6, v3, v7 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v15 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1284,51 +1093,47 @@ define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX8-LABEL: s_usubsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s4 -; GFX8-NEXT: s_cselect_b32 s4, s0, s4 -; GFX8-NEXT: s_sub_i32 s0, s0, s4 -; GFX8-NEXT: s_cmp_lt_u32 s1, s5 -; GFX8-NEXT: s_cselect_b32 s4, s1, s5 -; GFX8-NEXT: s_sub_i32 s1, s1, s4 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s4, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 -; GFX8-NEXT: s_cmp_lt_u32 s3, s7 -; GFX8-NEXT: s_cselect_b32 s4, s3, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s4 -; GFX9-NEXT: s_cselect_b32 s4, s0, s4 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_cmp_lt_u32 s1, s5 -; GFX9-NEXT: s_cselect_b32 s4, s1, s5 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_cmp_lt_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s4, s2, s6 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_u32 s3, s7 -; GFX9-NEXT: s_cselect_b32 s4, s3, s7 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s4 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s4 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s4, s0, s4 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_cmp_lt_u32 s1, s5 -; GFX10-NEXT: s_cselect_b32 s4, s1, s5 -; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_cmp_lt_u32 s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s2, s6 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_u32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s4, s3, s7 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result @@ -1353,47 +1158,32 @@ define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v5i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX10-NEXT: v_min_u32_e32 v6, v1, v6 -; GFX10-NEXT: v_min_u32_e32 v7, v2, v7 -; GFX10-NEXT: v_min_u32_e32 v8, v3, v8 -; GFX10-NEXT: v_min_u32_e32 v9, v4, v9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v5 -; GFX10-NEXT: 
v_sub_nc_u32_e32 v1, v1, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v8 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v9 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v7 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v8 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v9 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) @@ -1422,60 +1212,55 @@ define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX8-LABEL: s_usubsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s5 -; GFX8-NEXT: s_cselect_b32 s5, s0, s5 -; GFX8-NEXT: s_sub_i32 s0, s0, s5 -; GFX8-NEXT: s_cmp_lt_u32 s1, s6 -; GFX8-NEXT: s_cselect_b32 s5, s1, s6 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 -; GFX8-NEXT: s_cmp_lt_u32 s2, s7 -; GFX8-NEXT: s_cselect_b32 s5, s2, s7 -; GFX8-NEXT: s_sub_i32 s2, s2, s5 -; GFX8-NEXT: s_cmp_lt_u32 s3, s8 -; GFX8-NEXT: s_cselect_b32 s5, s3, s8 -; GFX8-NEXT: s_sub_i32 s3, s3, s5 -; GFX8-NEXT: s_cmp_lt_u32 s4, s9 -; GFX8-NEXT: s_cselect_b32 s5, s4, s9 -; GFX8-NEXT: s_sub_i32 s4, s4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v5i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s5 -; GFX9-NEXT: s_cselect_b32 s5, s0, s5 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 -; GFX9-NEXT: s_cmp_lt_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s5, s1, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_cmp_lt_u32 s2, s7 -; GFX9-NEXT: s_cselect_b32 s5, s2, s7 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_cmp_lt_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s5, s3, s8 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s9 -; GFX9-NEXT: s_cselect_b32 s5, s4, s9 -; GFX9-NEXT: s_sub_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v5i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s5 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; 
GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s0, s5 -; GFX10-NEXT: s_sub_i32 s0, s0, s5 -; GFX10-NEXT: s_cmp_lt_u32 s1, s6 -; GFX10-NEXT: s_cselect_b32 s5, s1, s6 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_cmp_lt_u32 s2, s7 -; GFX10-NEXT: s_cselect_b32 s5, s2, s7 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_cmp_lt_u32 s3, s8 -; GFX10-NEXT: s_cselect_b32 s5, s3, s8 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_u32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s5, s4, s9 -; GFX10-NEXT: s_sub_i32 s4, s4, s5 ; GFX10-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) ret <5 x i32> %result @@ -1522,113 +1307,65 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8-LABEL: v_usubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, 
v16 -; GFX9-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX9-NEXT: v_sub_u32_e32 v7, v7, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX9-NEXT: v_sub_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX9-NEXT: v_sub_u32_e32 v9, v9, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX9-NEXT: v_sub_u32_e32 v10, v10, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX9-NEXT: v_sub_u32_e32 v11, v11, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX9-NEXT: v_sub_u32_e32 v13, v13, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX9-NEXT: v_sub_u32_e32 v14, v14, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX9-NEXT: v_sub_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v35, v0, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v2, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v3, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v4, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v5, v21 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v7, v23 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v8, v24 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v9, v25 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v10, v26 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, v6, v16 -; GFX10-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, v7, v17 -; GFX10-NEXT: v_min_u32_e32 v17, v12, v28 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, v8, v18 -; GFX10-NEXT: v_min_u32_e32 v18, v13, v29 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, v9, v19 -; GFX10-NEXT: v_min_u32_e32 v19, v14, v30 -; GFX10-NEXT: v_sub_nc_u32_e32 v10, v10, v20 -; GFX10-NEXT: v_min_u32_e32 v20, v15, v31 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v35 -; GFX10-NEXT: v_sub_nc_u32_e32 v11, v11, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v12, v12, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v13, v13, v18 -; GFX10-NEXT: v_sub_nc_u32_e32 v14, v14, v19 -; GFX10-NEXT: v_sub_nc_u32_e32 v15, 
v15, v20 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) @@ -1690,159 +1427,143 @@ define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX8-LABEL: s_usubsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_cmp_lt_u32 s0, s16 -; GFX8-NEXT: s_cselect_b32 s16, s0, s16 -; GFX8-NEXT: s_sub_i32 s0, s0, s16 -; GFX8-NEXT: s_cmp_lt_u32 s1, s17 -; GFX8-NEXT: s_cselect_b32 s16, s1, s17 -; GFX8-NEXT: s_sub_i32 s1, s1, s16 -; GFX8-NEXT: s_cmp_lt_u32 s2, s18 -; GFX8-NEXT: s_cselect_b32 s16, s2, s18 -; GFX8-NEXT: s_sub_i32 s2, s2, s16 -; GFX8-NEXT: s_cmp_lt_u32 s3, s19 -; GFX8-NEXT: s_cselect_b32 s16, s3, s19 -; GFX8-NEXT: s_sub_i32 s3, s3, s16 -; GFX8-NEXT: s_cmp_lt_u32 s4, s20 -; GFX8-NEXT: s_cselect_b32 s16, s4, s20 -; GFX8-NEXT: s_sub_i32 s4, s4, s16 -; GFX8-NEXT: s_cmp_lt_u32 s5, s21 -; GFX8-NEXT: s_cselect_b32 s16, s5, s21 -; GFX8-NEXT: s_sub_i32 s5, s5, s16 -; GFX8-NEXT: s_cmp_lt_u32 s6, s22 -; GFX8-NEXT: s_cselect_b32 s16, s6, s22 -; GFX8-NEXT: s_sub_i32 s6, s6, s16 -; GFX8-NEXT: s_cmp_lt_u32 s7, s23 -; GFX8-NEXT: s_cselect_b32 s16, s7, s23 -; GFX8-NEXT: s_sub_i32 s7, s7, s16 -; GFX8-NEXT: s_cmp_lt_u32 s8, s24 -; GFX8-NEXT: s_cselect_b32 s16, s8, s24 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_cmp_lt_u32 s9, s25 -; GFX8-NEXT: s_cselect_b32 s16, s9, s25 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 -; GFX8-NEXT: s_cmp_lt_u32 s10, s26 -; GFX8-NEXT: s_cselect_b32 s16, s10, s26 -; GFX8-NEXT: s_sub_i32 s10, s10, s16 -; GFX8-NEXT: s_cmp_lt_u32 s11, s27 -; GFX8-NEXT: s_cselect_b32 s16, s11, s27 -; GFX8-NEXT: s_sub_i32 s11, s11, s16 -; GFX8-NEXT: s_cmp_lt_u32 s12, s28 -; GFX8-NEXT: s_cselect_b32 s16, s12, s28 -; GFX8-NEXT: s_sub_i32 s12, s12, s16 -; GFX8-NEXT: s_cmp_lt_u32 s13, s29 -; GFX8-NEXT: s_cselect_b32 s16, s13, s29 -; GFX8-NEXT: s_sub_i32 s13, s13, s16 -; GFX8-NEXT: s_cmp_lt_u32 s14, s30 -; GFX8-NEXT: s_cselect_b32 s16, s14, s30 -; GFX8-NEXT: s_sub_i32 s14, s14, s16 -; GFX8-NEXT: s_cmp_lt_u32 s15, s31 -; GFX8-NEXT: s_cselect_b32 s16, s15, s31 -; GFX8-NEXT: s_sub_i32 s15, s15, s16 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: 
v_sub_u32_e64 v0, s[32:33], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8-NEXT: v_readfirstlane_b32 s6, v6 +; GFX8-NEXT: v_readfirstlane_b32 s7, v7 +; GFX8-NEXT: v_readfirstlane_b32 s8, v8 +; GFX8-NEXT: v_readfirstlane_b32 s9, v9 +; GFX8-NEXT: v_readfirstlane_b32 s10, v10 +; GFX8-NEXT: v_readfirstlane_b32 s11, v11 +; GFX8-NEXT: v_readfirstlane_b32 s12, v12 +; GFX8-NEXT: v_readfirstlane_b32 s13, v13 +; GFX8-NEXT: v_readfirstlane_b32 s14, v14 +; GFX8-NEXT: v_readfirstlane_b32 s15, v15 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v16i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_cmp_lt_u32 s0, s16 -; GFX9-NEXT: s_cselect_b32 s16, s0, s16 -; GFX9-NEXT: s_sub_i32 s0, s0, s16 -; GFX9-NEXT: s_cmp_lt_u32 s1, s17 -; GFX9-NEXT: s_cselect_b32 s16, s1, s17 -; GFX9-NEXT: s_sub_i32 s1, s1, s16 -; GFX9-NEXT: s_cmp_lt_u32 s2, s18 -; GFX9-NEXT: s_cselect_b32 s16, s2, s18 -; GFX9-NEXT: s_sub_i32 s2, s2, s16 -; GFX9-NEXT: s_cmp_lt_u32 s3, s19 -; GFX9-NEXT: s_cselect_b32 s16, s3, s19 -; GFX9-NEXT: s_sub_i32 s3, s3, s16 -; GFX9-NEXT: s_cmp_lt_u32 s4, s20 -; GFX9-NEXT: s_cselect_b32 s16, s4, s20 -; GFX9-NEXT: s_sub_i32 s4, s4, s16 -; GFX9-NEXT: s_cmp_lt_u32 s5, s21 -; GFX9-NEXT: s_cselect_b32 s16, s5, s21 -; GFX9-NEXT: s_sub_i32 s5, s5, s16 -; GFX9-NEXT: s_cmp_lt_u32 s6, s22 -; GFX9-NEXT: s_cselect_b32 s16, s6, s22 -; GFX9-NEXT: s_sub_i32 s6, s6, s16 -; GFX9-NEXT: s_cmp_lt_u32 s7, s23 -; GFX9-NEXT: s_cselect_b32 s16, s7, s23 -; GFX9-NEXT: s_sub_i32 s7, s7, s16 -; GFX9-NEXT: s_cmp_lt_u32 s8, s24 -; GFX9-NEXT: s_cselect_b32 s16, s8, s24 -; GFX9-NEXT: s_sub_i32 s8, s8, s16 -; GFX9-NEXT: s_cmp_lt_u32 s9, s25 -; GFX9-NEXT: s_cselect_b32 s16, s9, s25 -; GFX9-NEXT: s_sub_i32 s9, s9, s16 -; GFX9-NEXT: s_cmp_lt_u32 s10, s26 -; GFX9-NEXT: s_cselect_b32 s16, s10, s26 -; GFX9-NEXT: s_sub_i32 s10, s10, s16 -; GFX9-NEXT: s_cmp_lt_u32 s11, s27 -; GFX9-NEXT: s_cselect_b32 s16, s11, s27 -; GFX9-NEXT: s_sub_i32 s11, s11, s16 -; GFX9-NEXT: s_cmp_lt_u32 s12, s28 -; GFX9-NEXT: s_cselect_b32 s16, s12, s28 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_lt_u32 s13, s29 -; GFX9-NEXT: s_cselect_b32 s16, s13, s29 -; GFX9-NEXT: s_sub_i32 s13, s13, s16 -; GFX9-NEXT: s_cmp_lt_u32 s14, s30 -; GFX9-NEXT: s_cselect_b32 s16, s14, s30 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_lt_u32 s15, s31 -; GFX9-NEXT: s_cselect_b32 s16, s15, s31 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v16i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_cmp_lt_u32 s0, s16 +; GFX10-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp +; GFX10-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 +; GFX10-NEXT: v_readfirstlane_b32 s14, v14 +; GFX10-NEXT: v_readfirstlane_b32 s15, v15 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; 
GFX10-NEXT: s_cselect_b32 s46, s0, s16 -; GFX10-NEXT: s_sub_i32 s0, s0, s46 -; GFX10-NEXT: s_cmp_lt_u32 s1, s17 -; GFX10-NEXT: s_cselect_b32 s46, s1, s17 -; GFX10-NEXT: s_sub_i32 s1, s1, s46 -; GFX10-NEXT: s_cmp_lt_u32 s2, s18 -; GFX10-NEXT: s_cselect_b32 s16, s2, s18 -; GFX10-NEXT: s_sub_i32 s2, s2, s16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s19 -; GFX10-NEXT: s_cselect_b32 s16, s3, s19 -; GFX10-NEXT: s_sub_i32 s3, s3, s16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s20 -; GFX10-NEXT: s_cselect_b32 s16, s4, s20 -; GFX10-NEXT: s_sub_i32 s4, s4, s16 -; GFX10-NEXT: s_cmp_lt_u32 s5, s21 -; GFX10-NEXT: s_cselect_b32 s16, s5, s21 -; GFX10-NEXT: s_sub_i32 s5, s5, s16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s22 -; GFX10-NEXT: s_cselect_b32 s16, s6, s22 -; GFX10-NEXT: s_sub_i32 s6, s6, s16 -; GFX10-NEXT: s_cmp_lt_u32 s7, s23 -; GFX10-NEXT: s_cselect_b32 s16, s7, s23 -; GFX10-NEXT: s_sub_i32 s7, s7, s16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s24 -; GFX10-NEXT: s_cselect_b32 s16, s8, s24 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_cmp_lt_u32 s9, s25 -; GFX10-NEXT: s_cselect_b32 s16, s9, s25 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s26 -; GFX10-NEXT: s_cselect_b32 s16, s10, s26 -; GFX10-NEXT: s_sub_i32 s10, s10, s16 -; GFX10-NEXT: s_cmp_lt_u32 s11, s27 -; GFX10-NEXT: s_cselect_b32 s16, s11, s27 -; GFX10-NEXT: s_sub_i32 s11, s11, s16 -; GFX10-NEXT: s_cmp_lt_u32 s12, s28 -; GFX10-NEXT: s_cselect_b32 s16, s12, s28 -; GFX10-NEXT: s_sub_i32 s12, s12, s16 -; GFX10-NEXT: s_cmp_lt_u32 s13, s29 -; GFX10-NEXT: s_cselect_b32 s16, s13, s29 -; GFX10-NEXT: s_sub_i32 s13, s13, s16 -; GFX10-NEXT: s_cmp_lt_u32 s14, s30 -; GFX10-NEXT: s_cselect_b32 s16, s14, s30 -; GFX10-NEXT: s_sub_i32 s14, s14, s16 -; GFX10-NEXT: s_cmp_lt_u32 s15, s31 -; GFX10-NEXT: s_cselect_b32 s16, s15, s31 -; GFX10-NEXT: s_sub_i32 s15, s15, s16 ; GFX10-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result @@ -1862,24 +1583,21 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-LABEL: v_usubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u16_e64 v1, v0, v1 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -1898,30 +1616,23 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: s_usubsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 
s2, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cmp_lt_u32 s2, s1 -; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result @@ -1939,21 +1650,18 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; ; GFX8-LABEL: usubsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16_e64 v0, s0, v0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -1972,21 +1680,18 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; ; GFX8-LABEL: usubsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16_e64 v1, v0, s0 +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) %cast = bitcast i16 %result to half @@ -2012,28 +1717,25 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v3, v0, v1 -; GFX8-NEXT: v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v1, v0, v1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v1, v0, v1 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result @@ -2065,57 +1767,28 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s1 -; GFX8-NEXT: s_cselect_b32 s1, s4, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_sub_i32 s1, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s4, s0, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s1 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_cmp_lt_u32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s3, s2, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_and_b32 s4, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s2 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_lt_u32 s3, s1 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s1 -; GFX10-NEXT: s_sub_i32 s1, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> 
%result to i32 @@ -2146,24 +1819,22 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v0, s0, v0 -; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_u16 v0, s0, v0 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -2192,26 +1863,24 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: usubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_u16_e32 v2, s0, v0 -; GFX8-NEXT: v_min_u16_e32 v3, s1, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v1, v0, s0 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: usubsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_u16 v1, v0, s0 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, s0 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) %cast = bitcast <2 x i16> %result to float @@ -2267,38 +1936,31 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v6, v0, v2 -; GFX8-NEXT: v_min_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v7, v1, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v2, v0, v2 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_u16 v2, v1, v3 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v2, v0, v2 -; GFX10-NEXT: v_pk_min_u16 v3, v1, v3 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> @@ -2346,100 +2008,43 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-LABEL: s_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s2 -; GFX8-NEXT: s_cselect_b32 s2, s8, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s3 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_sub_i32 s3, s5, s3 -; 
GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s6, s0, s5 -; GFX9-NEXT: s_and_b32 s2, s2, s5 -; GFX9-NEXT: s_cmp_lt_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s6, s4, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s2, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s4, s1, s5 -; GFX9-NEXT: s_and_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_cmp_lt_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s4, s2, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_and_b32 s7, s2, s4 -; GFX10-NEXT: s_and_b32 s6, s0, s4 -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s7 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s3 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s7 -; GFX10-NEXT: s_cmp_lt_u32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s2 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_sub_i32 s2, s5, s6 -; GFX10-NEXT: s_and_b32 s6, s1, s4 -; GFX10-NEXT: s_and_b32 s4, s3, s4 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_cmp_lt_u32 s5, s3 -; GFX10-NEXT: s_cselect_b32 s3, s5, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, 
s3 -; GFX10-NEXT: s_sub_i32 s3, s5, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -2509,48 +2114,38 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v9, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v10, v1, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v11, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v3, v0, v3 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v1, v4 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v2, v5 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v6i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
v_pk_min_u16 v3, v0, v3 -; GFX10-NEXT: v_pk_min_u16 v4, v1, v4 -; GFX10-NEXT: v_pk_min_u16 v5, v2, v5 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v3 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v4 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, v2, v5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x float> @@ -2614,142 +2209,57 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-LABEL: s_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s12, s3 -; GFX8-NEXT: s_cselect_b32 s3, s12, s3 -; GFX8-NEXT: s_sub_i32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s3, s9 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s4 -; GFX8-NEXT: s_cselect_b32 s4, s6, s4 -; GFX8-NEXT: s_sub_i32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s6 -; GFX8-NEXT: s_cselect_b32 s4, s4, s6 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s5 -; GFX8-NEXT: s_cselect_b32 s5, s6, s5 -; GFX8-NEXT: s_sub_i32 s2, s2, s5 -; GFX8-NEXT: s_bfe_u32 s5, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s6 -; GFX8-NEXT: s_cselect_b32 s5, s5, s6 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_sub_i32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; 
GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 -; GFX9-NEXT: s_cmp_lt_u32 s8, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cmp_lt_u32 s6, s9 -; GFX9-NEXT: s_cselect_b32 s8, s6, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s6, s1, s7 -; GFX9-NEXT: s_and_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cmp_lt_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s6, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s4, s2, s7 -; GFX9-NEXT: s_and_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s5, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s6, 0xffff -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s9, s3, s6 -; GFX10-NEXT: s_and_b32 s8, s0, s6 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s3 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s4 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, s2, s5 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s9 -; GFX10-NEXT: s_cmp_lt_u32 s7, s3 -; GFX10-NEXT: s_cselect_b32 s3, s7, s3 -; GFX10-NEXT: s_and_b32 s9, s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_sub_i32 s3, s7, s8 -; GFX10-NEXT: s_and_b32 s8, s1, s6 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_cselect_b32 s8, s8, s9 -; GFX10-NEXT: s_cmp_lt_u32 s7, s4 -; GFX10-NEXT: s_cselect_b32 s4, s7, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s4, s7, s8 -; GFX10-NEXT: s_and_b32 s8, s2, s6 -; 
GFX10-NEXT: s_and_b32 s6, s5, s6 -; GFX10-NEXT: s_lshr_b32 s7, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_cmp_lt_u32 s7, s5 -; GFX10-NEXT: s_cselect_b32 s5, s7, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s6, s5 -; GFX10-NEXT: s_lshr_b32 s3, s5, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_sub_i32 s3, s7, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -2822,58 +2332,44 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v12, v0, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v13, v1, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v14, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v4, v0, v4 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v1, v5 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v2, v6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v3, v7 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_min_u16 v19, v2, v6 -; GFX10-NEXT: v_pk_min_u16 v11, v0, v4 -; GFX10-NEXT: v_pk_min_u16 v15, v1, v5 -; GFX10-NEXT: v_pk_min_u16 v6, v3, v7 +; GFX10-NEXT: v_pk_sub_u16 v0, v0, v4 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v1, v5 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, v2, v6 clamp +; GFX10-NEXT: v_pk_sub_u16 v3, v3, v7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, v2, v19 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v11 -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v15 -; GFX10-NEXT: v_pk_sub_i16 v3, v3, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x float> @@ -2953,184 +2449,71 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-LABEL: s_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s16, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s16, s4 -; GFX8-NEXT: s_cselect_b32 s4, s16, s4 -; GFX8-NEXT: s_sub_i32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s5 -; GFX8-NEXT: s_cselect_b32 s5, s8, s5 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 -; GFX8-NEXT: s_bfe_u32 s5, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s8 -; GFX8-NEXT: s_cselect_b32 s5, 
s5, s8 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s6 -; GFX8-NEXT: s_cselect_b32 s6, s8, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s8 -; GFX8-NEXT: s_cselect_b32 s6, s6, s8 -; GFX8-NEXT: s_sub_i32 s6, s10, s6 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s7 -; GFX8-NEXT: s_cselect_b32 s7, s8, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s7 -; GFX8-NEXT: s_bfe_u32 s7, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s7, s8 -; GFX8-NEXT: s_cselect_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 -; GFX8-NEXT: s_sub_i32 s7, s11, s7 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_u32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_cmp_lt_u32 s8, s11 -; GFX9-NEXT: s_cselect_b32 s10, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_lshr_b32 
s4, s1, 16 -; GFX9-NEXT: s_and_b32 s8, s1, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_u32 s8, s5 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s10 -; GFX9-NEXT: s_cselect_b32 s8, s4, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s2, s9 -; GFX9-NEXT: s_and_b32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_u32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s6, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s5, s3, s9 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s6, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_usubsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s11, s4, s8 -; GFX10-NEXT: s_and_b32 s10, s0, s8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 +; GFX10-NEXT: v_pk_sub_u16 v0, s0, s4 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, s1, s5 clamp +; GFX10-NEXT: v_pk_sub_u16 v2, s2, s6 clamp +; GFX10-NEXT: v_pk_sub_u16 v3, s3, s7 clamp ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 -; GFX10-NEXT: s_and_b32 s11, s5, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4 -; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s1, s8 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s5 -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_and_b32 s11, s6, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5 -; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 -; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_sub_i32 s5, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s2, s8 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s6, s9, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 
s6, s10, s6 -; GFX10-NEXT: s_lshr_b32 s10, s6, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s6 -; GFX10-NEXT: s_sub_i32 s6, s9, s10 -; GFX10-NEXT: s_and_b32 s10, s3, s8 -; GFX10-NEXT: s_and_b32 s8, s7, s8 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s8, s10, s8 -; GFX10-NEXT: s_cmp_lt_u32 s9, s7 -; GFX10-NEXT: s_cselect_b32 s7, s9, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s7 -; GFX10-NEXT: s_lshr_b32 s5, s4, 16 -; GFX10-NEXT: s_sub_i32 s3, s3, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 2512aaaeb082c..474f6655bda2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -50,16 +50,14 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX900-NEXT: s_xor_b32 s0, s0, s1 -; GFX900-NEXT: s_xor_b32 s0, s0, s2 +; GFX900-NEXT: s_xor_b32 s0, s0, -1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: scalar_xnor_v2i16_one_use: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX906-NEXT: s_xor_b32 s0, s0, s1 -; GFX906-NEXT: s_xor_b32 s0, s0, s2 +; GFX906-NEXT: s_xor_b32 s0, s0, -1 ; GFX906-NEXT: ; return to shader part epilog entry: %xor = xor <2 x i16> %a, %b @@ -150,7 +148,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX900-NEXT: s_mov_b32 s4, -1 ; GFX900-NEXT: s_mov_b32 s5, s4 ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] @@ -158,7 +156,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX906-LABEL: scalar_xnor_v4i16_one_use: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX906-NEXT: s_mov_b32 s4, -1 ; GFX906-NEXT: s_mov_b32 s5, s4 ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll index dc899ed8ba98a..1ebca23e3eda7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -139,9 +139,8 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -149,9 +148,8 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_mov_b32 
s4, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -162,9 +160,8 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) { ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr @@ -177,11 +174,9 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -189,11 +184,9 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -204,11 +197,9 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) { ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index 667a22ea0c1da..de839d9253e80 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -28,10 +28,16 @@ define amdgpu_kernel void @a2_to_a2() #0 { ret void } define amdgpu_kernel void @a3_to_a3() #0 { ret void } define amdgpu_kernel void @a4_to_a4() #0 { ret void } + define amdgpu_kernel void @a4_to_a4_overlap() #0 { ret void } define amdgpu_kernel void @a8_to_a8() #0 { ret void } define amdgpu_kernel void @a16_to_a16() #0 { ret void } define amdgpu_kernel void @a_to_a_spill() #0 { ret void } + define amdgpu_kernel void @copy_sgpr_to_agpr_tuple() #0 { ret void } + define amdgpu_kernel void @copy_sgpr_to_agpr_tuple_kill() #0 { ret void } + define amdgpu_kernel void @copy_agpr_to_agpr_tuple() #0 { ret void } + define amdgpu_kernel void @copy_agpr_to_agpr_tuple_kill() #0 { ret void } + attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ... 
@@ -285,9 +291,9 @@ body: | liveins: $sgpr0_sgpr1 ; GCN-LABEL: name: s2_to_a2 ; GCN: liveins: $sgpr0_sgpr1 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec @@ -302,11 +308,11 @@ body: | liveins: $sgpr0_sgpr1_sgpr2 ; GCN-LABEL: name: s3_to_a3 ; GCN: liveins: $sgpr0_sgpr1_sgpr2 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec @@ -321,13 +327,13 @@ body: | liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-LABEL: name: s4_to_a4 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -342,17 +348,17 @@ body: | liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN-LABEL: name: s6_to_a6 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, 
implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec @@ -367,21 +373,21 @@ body: | liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-LABEL: name: s8_to_a8 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GCN: $agpr7 = 
V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec @@ -396,37 +402,37 @@ body: | liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-LABEL: name: s16_to_a16 ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr11, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr12, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr13, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr14, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec + ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec @@ -455,9 +461,9 @@ body: | bb.0: ; GCN-LABEL: name: a2_to_a2 ; GCN: $agpr0_agpr1 = IMPLICIT_DEF - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1 + ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit-def $agpr1_agpr2 + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1 ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr1_agpr2 $agpr0_agpr1 = IMPLICIT_DEF @@ -472,11 +478,11 @@ body: | bb.0: ; GCN-LABEL: name: a3_to_a3 ; GCN: $agpr0_agpr1_agpr2 = IMPLICIT_DEF - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr2, implicit $exec - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, 
implicit-def $agpr2_agpr3_agpr4 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4 $agpr0_agpr1_agpr2 = IMPLICIT_DEF @@ -491,20 +497,39 @@ body: | bb.0: ; GCN-LABEL: name: a4_to_a4 ; GCN: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr3, implicit $exec - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr2, implicit $exec - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit $exec + ; GCN: $vgpr3 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr3, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ... +--- +name: a4_to_a4_overlap +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3 + ; GCN-LABEL: name: a4_to_a4_overlap + ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec + ; GCN: $vgpr3 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr3, implicit $exec, implicit $exec + ; GCN: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec + S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 +... 
+ --- name: a8_to_a8 tracksRegLiveness: true @@ -512,21 +537,21 @@ body: | bb.0: ; GCN-LABEL: name: a8_to_a8 ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr7, implicit $exec - ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr6, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr5, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr3, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr2, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF @@ -542,37 +567,37 @@ body: | ; GCN-LABEL: name: a16_to_a16 ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr15, implicit $exec - ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr14, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = 
V_ACCVGPR_READ_B32 killed $agpr13, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr29 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr12, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr28 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr11, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr27 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr10, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr26 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr9, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr25 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr8, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr24 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr7, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr23 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr6, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr22 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr5, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr21 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr4, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr20 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr3, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr19 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr2, implicit $exec + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr18 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr17 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GCN: $agpr16 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec ; GCN: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF @@ -599,3 +624,100 @@ body: | $agpr0 = COPY killed $agpr1, implicit $exec S_ENDPGM 0, implicit $agpr0 ... + +--- +name: copy_sgpr_to_agpr_tuple +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0, $sgpr2_sgpr3 + + ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple + ; GCN: liveins: $agpr0, $sgpr2_sgpr3 + ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + S_NOP 0, implicit-def dead $sgpr0_sgpr1 + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 +... 
+--- +name: copy_sgpr_to_agpr_tuple_kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0, $sgpr2_sgpr3 + + ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple_kill + ; GCN: liveins: $agpr0, $sgpr2_sgpr3 + ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec + ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec + ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + S_NOP 0, implicit-def dead $sgpr0_sgpr1 + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 +... + +--- +name: copy_agpr_to_agpr_tuple +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0, $agpr2_agpr3 + + ; GCN-LABEL: name: copy_agpr_to_agpr_tuple + ; GCN: liveins: $agpr0, $agpr2_agpr3 + ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + S_NOP 0, implicit-def dead $agpr0_agpr1 + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec + S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 +... 
+ +--- +name: copy_agpr_to_agpr_tuple_kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0, $agpr2_agpr3 + + ; GCN-LABEL: name: copy_agpr_to_agpr_tuple_kill + ; GCN: liveins: $agpr0, $agpr2_agpr3 + ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32 killed $vgpr2, implicit $exec + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + S_NOP 0, implicit-def dead $agpr0_agpr1 + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 +... + +--- diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll new file mode 100644 index 0000000000000..223efcc738188 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll @@ -0,0 +1,7 @@ +; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*) + +@lds.arr = unnamed_addr addrspace(3) global [256 x i32] undef, align 4 + +@gv_flatptr_from_lds = unnamed_addr addrspace(2) global i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll new file mode 100644 index 0000000000000..4f5082f9bd08a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +; CHECK: global.arr: +; CHECK: .zero 1024 +; CHECK: .size global.arr, 1024 + +; CHECK: gv_flatptr_from_global: +; CHECK: .quad global.arr+32 +; CHECK: .size gv_flatptr_from_global, 8 + +; CHECK: gv_global_ptr: +; CHECK: .quad global.arr+32 +; CHECK: .size gv_global_ptr, 8 + +; CHECK: gv_flatptr_from_constant: +; CHECK: .quad constant.arr+32 +; CHECK: .size gv_flatptr_from_constant, 8 + +@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 +@constant.arr = external unnamed_addr addrspace(4) global [256 x i32], align 4 + +@gv_flatptr_from_global = unnamed_addr addrspace(4) global i32 addrspace(0)* getelementptr ([256 x i32], [256 x i32] addrspace(0)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(0)*), i64 0, i64 8), align 4 + + +@gv_global_ptr = unnamed_addr addrspace(4) global i32 addrspace(1)* getelementptr ([256 x i32], [256 x i32] addrspace(1)* @global.arr, i64 0, i64 8), align 4 + +@gv_flatptr_from_constant = unnamed_addr addrspace(4) global i32 addrspace(0)* getelementptr ([256 x i32], [256 x i32] addrspace(0)* addrspacecast ([256 x 
i32] addrspace(4)* @constant.arr to [256 x i32] addrspace(0)*), i64 0, i64 8), align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll index fde3ab8c6d4a4..be819ceb88ccf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -23,8 +23,6 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res ret void } -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -; Function Attrs: nounwind writeonly declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1 ; Function Attrs: nounwind readnone speculatable diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll index 569f0101e3bee..d6e977bd20e10 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll @@ -1,5 +1,9 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -filetype=obj -o %t.o < %s && llvm-readobj -relocations %t.o | FileCheck --check-prefix=ELF %s + +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -filetype=obj -o %t.o < %s && llvm-readobj -relocations %t.o | FileCheck --check-prefix=ELF %s + ; GCN-LABEL: {{^}}ps_main: ; GCN: v_mov_b32_{{.*}} v[[relocreg:[0-9]+]], doff_0_0_b@abs32@lo ; GCN-NEXT: exp {{.*}} v[[relocreg]], {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 3d75eca93cb48..8f4f1c3915351 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 79 +; TRAP-HANDLER-DISABLE: NumSgprs: 77 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 1cdd6f4e37105..884457b319f4b 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -493,11 +493,11 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { ; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v3i16: @@ -515,27 +515,27 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) { ; SI-LABEL: v_bswap_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8 -; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff ; SI-NEXT: s_mov_b32 s5, 0xffff0000 -; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; 
SI-NEXT: v_alignbit_b32 v7, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v7 +; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8 +; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 +; SI-NEXT: v_and_b32_e32 v4, s5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v3, s5, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 84eb94228decc..2986cb48a86ea 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -840,14 +840,14 @@ entry: ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 @@ -874,14 +874,14 @@ entry: ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll index 11f4b3c0b913a..9a52ccf541dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll @@ -1,11 +1,12 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s 
+; RUN: llc -global-isel=1 -amdgpu-fixed-function-abi=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s ; FIXME: Emitting unnecessary flat_scratch setup ; GCN-LABEL: {{^}}test_call_undef: -; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s7 -; GCN: s_lshr_b32 +; SDAG: s_mov_b32 flat_scratch_lo, s5 +; SDAG: s_add_u32 s4, s4, s7 +; SDAG: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_undef() #0 { %val = call i32 undef(i32 1) @@ -15,17 +16,21 @@ define amdgpu_kernel void @test_call_undef() #0 { } ; GCN-LABEL: {{^}}test_tail_call_undef: -; GCN: s_waitcnt -; GCN-NEXT: .Lfunc_end +; SDAG: s_waitcnt +; SDAG-NEXT: .Lfunc_end + +; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} define i32 @test_tail_call_undef() #0 { %call = tail call i32 undef(i32 1) ret i32 %call } ; GCN-LABEL: {{^}}test_call_null: -; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s7 -; GCN: s_lshr_b32 +; SDAG: s_mov_b32 flat_scratch_lo, s5 +; SDAG: s_add_u32 s4, s4, s7 +; SDAG: s_lshr_b32 + +; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define amdgpu_kernel void @test_call_null() #0 { %val = call i32 null(i32 1) @@ -35,8 +40,10 @@ define amdgpu_kernel void @test_call_null() #0 { } ; GCN-LABEL: {{^}}test_tail_call_null: -; GCN: s_waitcnt -; GCN-NEXT: .Lfunc_end +; SDAG: s_waitcnt +; SDAG-NEXT: .Lfunc_end + +; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define i32 @test_tail_call_null() #0 { %call = tail call i32 null(i32 1) ret i32 %call diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll new file mode 100644 index 0000000000000..0f655dadfa11d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s + +; Make sure we match the addressing mode offset of csub intrinsics across blocks. 
+ +define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32( +; OPT-NEXT: entry: +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999 +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3 +; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 +; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] +; OPT: if: +; OPT-NEXT: [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* +; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 +; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)* +; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2) +; OPT-NEXT: br label [[ENDIF]] +; OPT: endif: +; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ] +; OPT-NEXT: store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4 +; OPT-NEXT: ret void +; +; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN-NEXT: s_cbranch_execz BB0_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, 2 +; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off offset:28 glc +; GCN-NEXT: BB0_2: ; %endif +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_co_u32_e64 v1, s0, 0x3d0800, s0 +; GCN-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, s1, s0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v[1:2], v0, off offset:252 +; GCN-NEXT: s_endpgm +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %cmp = icmp eq i32 %tid, 0 + br i1 %cmp, label %endif, label %if + +if: + %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2) + br label %endif + +endif: + %x = phi i32 [ %val, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind readnone willreturn } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll new file mode 100644 index 0000000000000..840a4ec3dac8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s + +; Make sure we match 
the addressing mode offset of global.atomic.fadd intrinsics across blocks. + +define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( +; OPT-NEXT: entry: +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999 +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3 +; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 +; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] +; OPT: if: +; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* +; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 +; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)* +; OPT-NEXT: call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) +; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4 +; OPT-NEXT: br label [[ENDIF]] +; OPT: endif: +; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; OPT-NEXT: store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4 +; OPT-NEXT: ret void +; +; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz BB0_2 +; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, 2.0 +; GCN-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:28 +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: BB0_2: ; %endif +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_add_co_u32_e32 v1, vcc, 0x3d0000, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v[1:2], v0, off offset:2300 +; GCN-NEXT: s_endpgm +entry: + %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %cmp = icmp eq i32 %tid, 0 + br i1 %cmp, label %endif, label %if + +if: + call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) + %val = load volatile float, float addrspace(1)* undef + br label %endif + +endif: + %x = phi float [ %val, %if ], [ 0.0, %entry ] + store float %x, float addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 + +attributes #0 = { argmemonly nounwind } +attributes #1 = { nounwind readnone willreturn } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index a2fb893af5404..bb621b294bbc1 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -748,10 +748,65 @@ done: ret void } +; OPT-LABEL: @test_sink_small_offset_ds_append( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8
addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %1, i1 false) +define amdgpu_kernel void @test_sink_small_offset_ds_append(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %in.gep, i1 false) + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_small_offset_ds_consume( +; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)* +; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28 +; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)* +; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %1, i1 false) +define amdgpu_kernel void @test_sink_small_offset_ds_consume(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 + %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %in.gep, i1 false) + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(3)* %out.gep + br label %done + +done: + ret void +} + declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3 +declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3 attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind argmemonly } +attributes #3 = { argmemonly convergent nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 6bc3073eb18e0..27698c58e3c03 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -177,13 +177,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n ; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] -; SI-SDWA: v_or_b32_sdwa ; SI-SDWA: v_or_b32_e32 +; SI-SDWA: v_or_b32_sdwa ; SI-SDWA: v_or_b32_e32 -; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-SDWA: v_or_b32_sdwa +; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] ; SI: v_cmp_eq_u32_e32 vcc, 0 ; SI: v_cmp_ne_u64_e32 vcc, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 
21360aa85cbc1..f520b4a8fd8f4 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -636,19 +636,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1) define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s8, 0xff -; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_movk_i32 s0, 0xff +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -658,40 +659,39 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s8, v4 +; SI-NEXT: v_and_b32_e32 v0, s0, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 ; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; SI-NEXT: v_and_b32_e32 v2, s8, v2 +; SI-NEXT: v_and_b32_e32 v2, s0, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 @@ 
-705,11 +705,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -747,16 +747,17 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: v_or_b32_e32 v2, v9, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index f7f075b095317..0ba4d8d82bdf8 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -311,7 +311,7 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrsp %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id %load = load float, float addrspace(1)* %gep, align 4 - %v = fsub float -0.0, %load + %v = fneg float %load %canonicalized = tail call float @llvm.canonicalize.f32(float %v) store float %canonicalized, float addrspace(1)* %gep, align 4 ret void @@ -327,7 +327,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id %load = load float, float addrspace(1)* %gep, align 4 %v0 = fadd float %load, 0.0 - %v = fsub float -0.0, %v0 + %v = fneg float %v0 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) store float %canonicalized, float addrspace(1)* %gep, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7a44d11ad091d..c6af4baee6509 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -77,7 +77,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* % define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half 
addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) - %val.fabs.fneg = fsub half -0.0, %val.fabs + %val.fabs.fneg = fneg half %val.fabs %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) store half %canonicalized, half addrspace(1)* %out ret void @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace( ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out - %val.fneg = fsub half -0.0, %val + %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) store half %canonicalized, half addrspace(1)* %out ret void @@ -103,7 +103,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* % ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 { %val = load half, half addrspace(1)* %out - %val.fneg = fsub half -0.0, %val + %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) store half %canonicalized, half addrspace(1)* %out ret void @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half ad define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #2 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) - %val.fabs.fneg = fsub half -0.0, %val.fabs + %val.fabs.fneg = fneg half %val.fabs %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) store half %canonicalized, half addrspace(1)* %out ret void @@ -323,7 +323,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %val = load <2 x half>, <2 x half> addrspace(1)* %gep %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) - %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs + %val.fabs.fneg = fneg <2 x half> %val.fabs %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -340,7 +340,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %val = load <2 x half>, <2 x half> addrspace(1)* %gep - %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val + %fneg.val = fneg <2 x half> %val %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index e302d59b1ac74..d95194a8716df 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -57,7 +57,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) - %val.fabs.fneg = fsub float -0.0, %val.fabs + %val.fabs.fneg = fneg float %val.fabs %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg) store float
%canonicalized, float addrspace(1)* %out ret void @@ -69,7 +69,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out - %val.fneg = fsub float -0.0, %val + %val.fneg = fneg float %val %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg) store float %canonicalized, float addrspace(1)* %out ret void @@ -264,7 +264,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) - %val.fabs.fneg = fsub double -0.0, %val.fabs + %val.fabs.fneg = fneg double %val.fabs %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg) store double %canonicalized, double addrspace(1)* %out ret void @@ -275,7 +275,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspac ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out - %val.fneg = fsub double -0.0, %val + %val.fneg = fneg double %val %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) store double %canonicalized, double addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 197bdd77d2544..d62155c1220dc 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -311,7 +311,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) { ; GCN: global_store_dwordx4 define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 - %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %load + %neg = fneg <4 x float> %load %div = fdiv <4 x float> , %neg, !fpmath !0 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index 3375efa14dd9b..f4c8f67bbd635 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -1,11 +1,5 @@ ; RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0 -declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 -declare i32 @llvm.amdgcn.wwm.i32(i32) #1 -declare void @llvm.amdgcn.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #2 -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2 - define amdgpu_hs void @foo(i32 inreg %arg, <4 x i32> inreg %buffer) { entry: br label %work @@ -19,7 +13,7 @@ bb602: br i1 %tmp607, label %bb49, label %bb54 bb49: - tail call void @llvm.amdgcn.tbuffer.store.f32(float 1.000000e+00, <4 x i32> %buffer, i32 0, i32 1, i32 1, i32 4, i32 4, i32 7, i1 true, i1 false) #7 + call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1) ret void bb54:
-attributes #0 = { convergent nounwind readnone } -attributes #1 = { nounwind readnone speculatable } -attributes #2 = { nounwind writeonly } +declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 +declare i32 @llvm.amdgcn.wwm.i32(i32) #1 +declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2 + +attributes #0 = { convergent nounwind readnone willreturn } +attributes #1 = { convergent nounwind readnone speculatable willreturn } +attributes #2 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 58d61ccaf97bb..293f59275372d 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -32,6 +32,46 @@ define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1 ret void } +; GCN-LABEL: {{^}}fmul_fadd_f16: +; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX10-FLUSH: v_mul_f16_e32 +; GFX10-FLUSH: v_add_f16_e32 +; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, + half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { + %r0 = load half, half addrspace(1)* %in1 + %r1 = load half, half addrspace(1)* %in2 + %r2 = load half, half addrspace(1)* %in3 + %mul = fmul half %r0, %r1 + %add = fadd half %mul, %r2 + store half %add, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmul_fadd_contract_f16: +; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX10-FLUSH: v_mul_f16_e32 +; GFX10-FLUSH: v_add_f16_e32 +; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1, + half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { + %r0 = load half, half addrspace(1)* %in1 + %r1 = load half, half addrspace(1)* %in2 + %r2 = load half, half addrspace(1)* %in3 + %mul = fmul half %r0, %r1 + %add = fadd contract half %mul, %r2 + store half %add, half addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], @@ -216,7 +256,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, %r1 = load volatile half, half addrspace(1)* %gep.0 %r2 = load volatile half, half addrspace(1)* %gep.1 - %r1.fneg = fsub half -0.000000e+00, %r1 + %r1.fneg = fneg half %r1 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2) store half %r3, half addrspace(1)* %gep.out @@ -247,7 +287,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half %r1 = load volatile half, half addrspace(1)* %gep.0 %r2 = load volatile half, half addrspace(1)* %gep.1 - %r1.fneg = fsub half -0.000000e+00, %r1 + %r1.fneg = fneg half %r1 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2) store half %r3, half addrspace(1)* %gep.out @@ -272,7 +312,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half %r1 = load volatile half, half addrspace(1)* %gep.0 %r2 = load volatile half, half addrspace(1)* %gep.1 - %r2.fneg = fsub half -0.000000e+00, %r2 + %r2.fneg = fneg half %r2 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg) store half %r3, 
half addrspace(1)* %gep.out @@ -454,8 +494,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %a = load volatile half, half addrspace(1)* %gep0, align 2 %b = load volatile half, half addrspace(1)* %gep1, align 2 %c = load volatile half, half addrspace(1)* %gep2, align 2 - %nega = fsub half -0.000000e+00, %a - %negb = fsub half -0.000000e+00, %b + %nega = fneg half %a + %negb = fneg half %b %mul = fmul half %nega, %negb %sub = fadd half %mul, %c store half %sub, half addrspace(1)* %outgep, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index 7b38b79c78a15..b97d36c046aff 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -69,6 +69,24 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa ret void } +; GCN-LABEL: {{^}}fmul_fadd_contract_f32: +; GCN-FLUSH-FMAC: v_fmac_f32_e32 + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 + +; GCN-DENORM-FASTFMA: v_fma_f32 +define amdgpu_kernel void @fmul_fadd_contract_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { + %r0 = load volatile float, float addrspace(1)* %in1 + %r1 = load volatile float, float addrspace(1)* %in2 + %r2 = load volatile float, float addrspace(1)* %in3 + %mul = fmul float %r0, %r1 + %add = fadd contract float %mul, %r2 + store float %add, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32 ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], @@ -256,7 +274,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, %r1 = load volatile float, float addrspace(1)* %gep.0 %r2 = load volatile float, float addrspace(1)* %gep.1 - %r1.fneg = fsub float -0.000000e+00, %r1 + %r1.fneg = fneg float %r1 %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) store float %r3, float addrspace(1)* %gep.out @@ -289,7 +307,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo %r1 = load volatile float, float addrspace(1)* %gep.0 %r2 = load volatile float, float addrspace(1)* %gep.1 - %r1.fneg = fsub float -0.000000e+00, %r1 + %r1.fneg = fneg float %r1 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) store float %r3, float addrspace(1)* %gep.out @@ -321,7 +339,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo %r1 = load volatile float, float addrspace(1)* %gep.0 %r2 = load volatile float, float addrspace(1)* %gep.1 - %r2.fneg = fsub float -0.000000e+00, %r2 + %r2.fneg = fneg float %r2 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) store float %r3, float addrspace(1)* %gep.out @@ -499,8 +517,8 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %a = load volatile float, float addrspace(1)* %gep0, align 4 %b = load volatile float, float addrspace(1)* %gep1, align 4 %c = load volatile float, float addrspace(1)* %gep2, align 4 - %nega = fsub float -0.000000e+00, %a - %negb = fsub float -0.000000e+00, %b + %nega = fneg float %a + %negb = fneg float %b %mul = fmul float %nega, %negb %sub = fadd float %mul, %c store float %sub, float addrspace(1)* %outgep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll index 8d91a56ee4211..cdc4f3b5f31d0 
100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s @@ -33,6 +33,20 @@ define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrs ret void } +; GCN-LABEL: {{^}}fmul_fadd_contract_f64: +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define amdgpu_kernel void @fmul_fadd_contract_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %tmp = fmul double %r0, %r1 + %r3 = fadd contract double %tmp, %r2 + store double %r3, double addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}fadd_a_a_b_f64: ; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]], ; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll index 95ea266c6eb72..74c91fbea44d1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -27,6 +27,39 @@ define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x hal ret void } +; GCN-LABEL: {{^}}fmul_fadd_v2f16: +; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +define amdgpu_kernel void @fmul_fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { + %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 + %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 + %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3 + %r3 = fmul <2 x half> %r0, %r1 + %r4 = fadd <2 x half> %r3, %r2 + store <2 x half> %r4, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16: +; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +define amdgpu_kernel void @fmul_fadd_contract_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { + %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 + %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 + %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3 + %r3 = fmul <2 x half> %r0, %r1 + %r4 = fadd contract <2 x half> %r3, %r2 + store <2 x half> %r4, <2 x half>
addrspace(1)* %out + ret void +} + + ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16: ; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], ; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 54ccc8fd870d4..01b0e6d17d1ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %add = fadd float %a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store float %fneg, float addrspace(1)* %out.gep ret void } @@ -47,7 +47,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %add = fadd float %a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store volatile float %fneg, float addrspace(1)* %out store volatile float %add, float addrspace(1)* %out ret void @@ -75,7 +75,7 @@ define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %add = fadd float %a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add %use1 = fmul float %add, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -100,9 +100,9 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %add = fadd float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store volatile float %fneg, float addrspace(1)* %out ret void } @@ -124,9 +124,9 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.b = fsub float -0.000000e+00, %b + %fneg.b = fneg float %b %add = fadd float %a, %fneg.b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store volatile float %fneg, float addrspace(1)* %out ret void } @@ -148,10 +148,10 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a - %fneg.b = fsub float -0.000000e+00, %b + %fneg.a = fneg float %a + %fneg.b = fneg float %b %add = fadd float %fneg.a, %fneg.b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store volatile float %fneg, float addrspace(1)* %out ret void } @@ -177,9 +177,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* % %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub 
float -0.000000e+00, %a + %fneg.a = fneg float %a %add = fadd float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add store volatile float %fneg, float addrspace(1)* %out store volatile float %fneg.a, float addrspace(1)* %out ret void @@ -205,9 +205,9 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* % %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %add = fadd float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %add + %fneg = fneg float %add %use1 = fmul float %fneg.a, %c store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -226,7 +226,7 @@ define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 %.i188 = fadd float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 - %tmp11 = fsub float -0.000000e+00, %.i188 + %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 %tmp12 = fcmp ule float %.i092, 0.000000e+00 %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000 @@ -249,7 +249,7 @@ define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 %.i188 = fadd float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 - %tmp11 = fsub float -0.000000e+00, %.i188 + %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 %tmp12 = fcmp ule float %.i092, 0.000000e+00 %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000 @@ -274,7 +274,7 @@ define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrsp %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %mul = fmul float %a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store float %fneg, float addrspace(1)* %out.gep ret void } @@ -295,7 +295,7 @@ define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %mul = fmul float %a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store volatile float %fneg, float addrspace(1)* %out store volatile float %mul, float addrspace(1)* %out ret void @@ -318,7 +318,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %mul = fmul float %a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul %use1 = fmul float %mul, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -338,9 +338,9 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %mul = fmul float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store volatile float %fneg, float addrspace(1)* %out ret void } @@ 
-358,9 +358,9 @@ define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.b = fsub float -0.000000e+00, %b + %fneg.b = fneg float %b %mul = fmul float %a, %fneg.b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store volatile float %fneg, float addrspace(1)* %out ret void } @@ -378,10 +378,10 @@ define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, fl %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a - %fneg.b = fsub float -0.000000e+00, %b + %fneg.a = fneg float %a + %fneg.b = fneg float %b %mul = fmul float %fneg.a, %fneg.b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store volatile float %fneg, float addrspace(1)* %out ret void } @@ -402,9 +402,9 @@ define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* % %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %mul = fmul float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul store volatile float %fneg, float addrspace(1)* %out store volatile float %fneg.a, float addrspace(1)* %out ret void @@ -425,9 +425,9 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* % %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %mul = fmul float %fneg.a, %b - %fneg = fsub float -0.000000e+00, %mul + %fneg = fneg float %mul %use1 = fmul float %fneg.a, %c store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -454,7 +454,7 @@ define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, floa %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %min = call float @llvm.minnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -466,7 +466,7 @@ define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, floa ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 { %min = call float @llvm.minnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min ret float %fneg } @@ -509,7 +509,7 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float 4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, ; GCN-NEXT: ; return define amdgpu_ps float 
@v_fneg_posk_minnum_f32_no_ieee(float %a) #0 { %min = call float @llvm.minnum.f32(float 4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min ret float %fneg } @@ -536,7 +536,7 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float -4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -547,7 +547,7 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 { %min = call float @llvm.minnum.f32(float -4.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min ret float %fneg } @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float a %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -579,7 +579,7 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float -0.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -602,7 +602,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, fl %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -624,7 +624,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min store float %fneg, float addrspace(1)* %out.gep ret void } @@ -730,7 +730,7 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %ou ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 { %min = call float @llvm.minnum.f32(float -0.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min ret float %fneg } @@ -750,7 +750,7 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %min = call float @llvm.minnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min %mul = fmul float %fneg, %b store float %mul, float addrspace(1)* %out.gep ret void @@ -779,7 +779,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile 
float, float addrspace(1)* %b.gep %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min %mul = fmul float %fneg, %b store float %mul, float addrspace(1)* %out.gep ret void @@ -793,7 +793,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { %min = call float @llvm.minnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min %mul = fmul float %fneg, %b ret float %mul } @@ -816,7 +816,7 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspa %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %min = call float @llvm.minnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min %use1 = fmul float %min, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -831,7 +831,7 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspa ; GCN-NEXT: ; return define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 { %min = call float @llvm.minnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %min + %fneg = fneg float %min %use1 = fmul float %min, 4.0 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 @@ -859,7 +859,7 @@ define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, floa %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %max = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max store float %fneg, float addrspace(1)* %out.gep ret void } @@ -871,7 +871,7 @@ define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, floa ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 { %max = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max ret float %fneg } @@ -914,7 +914,7 @@ define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %max = call float @llvm.maxnum.f32(float 4.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max store float %fneg, float addrspace(1)* %out.gep ret void } @@ -925,7 +925,7 @@ define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 { %max = call float @llvm.maxnum.f32(float 4.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max ret float %fneg } @@ -941,7 +941,7 @@ define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %max = call float @llvm.maxnum.f32(float -4.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max store float %fneg, float addrspace(1)* %out.gep ret void } @@ -952,7 +952,7 @@ define amdgpu_kernel void 
@v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 { %max = call float @llvm.maxnum.f32(float -4.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max ret float %fneg } @@ -967,7 +967,7 @@ define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float a %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %max = call float @llvm.maxnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max store float %fneg, float addrspace(1)* %out.gep ret void } @@ -984,7 +984,7 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep %max = call float @llvm.maxnum.f32(float -0.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max store float %fneg, float addrspace(1)* %out.gep ret void } @@ -995,7 +995,7 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 { %max = call float @llvm.maxnum.f32(float -0.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max ret float %fneg } @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %max = call float @llvm.maxnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max %mul = fmul float %fneg, %b store float %mul, float addrspace(1)* %out.gep ret void @@ -1029,7 +1029,7 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace ; GCN-NEXT: ; return define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { %max = call float @llvm.maxnum.f32(float 0.0, float %a) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max %mul = fmul float %fneg, %b ret float %mul } @@ -1052,7 +1052,7 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspa %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %max = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max %use1 = fmul float %max, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -1067,7 +1067,7 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspa ; GCN-NEXT: ; return define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 { %max = call float @llvm.maxnum.f32(float %a, float %b) - %fneg = fsub float -0.000000e+00, %max + %fneg = fneg float %max %use1 = fmul float %max, 4.0 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 @@ -1099,7 +1099,7 @@ define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrsp %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma 
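; [annotation, not part of the original patch] The hunks in this file replace the
; historical negation idiom with LLVM's dedicated unary instruction. A minimal
; sketch of the two spellings, assuming an arbitrary float value %x:
;   %neg.old = fsub float -0.000000e+00, %x  ; old idiom: binary subtract from -0.0
;   %neg.new = fneg float %x                 ; unary fneg: flips the sign bit and
;                                            ; gives the same result for any non-NaN %x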
store float %fneg, float addrspace(1)* %out.gep ret void } @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out store volatile float %fma, float addrspace(1)* %out ret void @@ -1154,7 +1154,7 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma %use1 = fmul float %fma, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -1182,9 +1182,9 @@ define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, flo %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out ret void } @@ -1210,9 +1210,9 @@ define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, flo %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.b = fsub float -0.000000e+00, %b + %fneg.b = fneg float %b %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out ret void } @@ -1238,10 +1238,10 @@ define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.a = fsub float -0.000000e+00, %a - %fneg.b = fsub float -0.000000e+00, %b + %fneg.a = fneg float %a + %fneg.b = fneg float %b %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out ret void } @@ -1267,10 +1267,10 @@ define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.a = fsub float -0.000000e+00, %a - %fneg.c = fsub float -0.000000e+00, %c + %fneg.a = fneg float %a + %fneg.c = fneg float %c %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out ret void } @@ -1296,9 +1296,9 @@ define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, flo %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.c = fsub float -0.000000e+00, %c + %fneg.c = 
fneg float %c %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out ret void } @@ -1330,9 +1330,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store volatile float %fneg, float addrspace(1)* %out store volatile float %fneg.a, float addrspace(1)* %out ret void @@ -1360,9 +1360,9 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %a = load volatile float, float addrspace(1)* %a.gep %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma %use1 = fmul float %fneg.a, %d store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -1394,7 +1394,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma store float %fneg, float addrspace(1)* %out.gep ret void } @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %o %b = load volatile float, float addrspace(1)* %b.gep %c = load volatile float, float addrspace(1)* %c.gep %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c) - %fneg = fsub float -0.000000e+00, %fma + %fneg = fneg float %fma %use1 = fmul float %fma, 4.0 store volatile float %fneg, float addrspace(1)* %out store volatile float %use1, float addrspace(1)* %out @@ -1483,7 +1483,7 @@ define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fpext = fpext float %fneg.a to double %fneg = fsub double -0.000000e+00, %fpext store double %fneg, double addrspace(1)* %out.gep @@ -1502,7 +1502,7 @@ define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double add %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fpext = fpext float %fneg.a to double %fneg = fsub double -0.000000e+00, %fpext store volatile double %fneg, double addrspace(1)* %out.gep @@ -1559,7 +1559,7 @@ define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addr %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile half, half addrspace(1)* %a.gep %fpext = 
fpext half %a to float - %fneg = fsub float -0.000000e+00, %fpext + %fneg = fneg float %fpext store volatile float %fneg, float addrspace(1)* %out.gep store volatile float %fpext, float addrspace(1)* %out.gep ret void @@ -1573,7 +1573,7 @@ define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(f %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile half, half addrspace(1)* %a.gep %fpext = fpext half %a to float - %fneg = fsub float -0.000000e+00, %fpext + %fneg = fneg float %fpext %mul = fmul float %fpext, 4.0 store volatile float %fneg, float addrspace(1)* %out.gep store volatile float %mul, float addrspace(1)* %out.gep @@ -1595,7 +1595,7 @@ define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile double, double addrspace(1)* %a.gep %fpround = fptrunc double %a to float - %fneg = fsub float -0.000000e+00, %fpround + %fneg = fneg float %fpround store float %fneg, float addrspace(1)* %out.gep ret void } @@ -1612,7 +1612,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* % %a = load volatile double, double addrspace(1)* %a.gep %fneg.a = fsub double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float - %fneg = fsub float -0.000000e+00, %fpround + %fneg = fneg float %fpround store float %fneg, float addrspace(1)* %out.gep ret void } @@ -1631,7 +1631,7 @@ define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrs %a = load volatile double, double addrspace(1)* %a.gep %fneg.a = fsub double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float - %fneg = fsub float -0.000000e+00, %fpround + %fneg = fneg float %fpround store volatile float %fneg, float addrspace(1)* %out.gep store volatile double %fneg.a, double addrspace(1)* undef ret void @@ -1652,7 +1652,7 @@ define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrs %a = load volatile double, double addrspace(1)* %a.gep %fneg.a = fsub double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float - %fneg = fsub float -0.000000e+00, %fpround + %fneg = fneg float %fpround %use1 = fmul double %fneg.a, %c store volatile float %fneg, float addrspace(1)* %out.gep store volatile double %use1, double addrspace(1)* undef @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %o %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext %a = load volatile float, float addrspace(1)* %a.gep - %fneg.a = fsub float -0.000000e+00, %a + %fneg.a = fneg float %a %fpround = fptrunc float %fneg.a to half %fneg = fsub half -0.000000e+00, %fpround store half %fneg, half addrspace(1)* %out.gep @@ -1705,7 +1705,7 @@ define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrs %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext %a = load volatile double, double addrspace(1)* %a.gep %fpround = fptrunc double %a to float - %fneg = fsub float -0.000000e+00, %fpround + %fneg = fneg float %fpround store volatile float %fneg, float addrspace(1)* %out.gep store volatile float %fpround, float addrspace(1)* %out.gep ret void @@ -1723,7 +1723,7 @@ define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrsp %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext 
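; [annotation, not part of the original patch] Note the scope of the rewrite:
; only f32 negations become 'fneg' here; the f16 and f64 negations in these
; fp_round/fp_extend tests deliberately keep the fsub spelling, e.g. just below:
;   %fneg.a = fneg float %a                    ; converted by this patch
;   %fneg = fsub half -0.000000e+00, %fpround  ; left in the legacy form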
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
@@ -1743,7 +1743,7 @@ define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrsp
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
@@ -1767,7 +1767,7 @@ define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrsp
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
-  %fneg = fsub float -0.000000e+00, %rcp
+  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
 }
@@ -1782,9 +1782,9 @@ define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float a
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
-  %fneg = fsub float -0.000000e+00, %rcp
+  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
 }
@@ -1801,9 +1801,9 @@ define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %ou
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
-  %fneg = fsub float -0.000000e+00, %rcp
+  %fneg = fneg float %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
@@ -1821,9 +1821,9 @@ define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %ou
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
-  %fneg = fsub float -0.000000e+00, %rcp
+  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
@@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
 }
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
@@ -1891,7 +1891,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
@@ -1911,9 +1911,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
 }
@@ -1931,9 +1931,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
-  %fneg.b = fsub float -0.000000e+00, %b
+  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
 }
@@ -1951,10 +1951,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
-  %fneg.a = fsub float -0.000000e+00, %a
-  %fneg.b = fsub float -0.000000e+00, %b
+  %fneg.a = fneg float %a
+  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
 }
@@ -1974,9 +1974,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspac
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
@@ -1997,9 +1997,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspac
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
-  %fneg.a = fsub float -0.000000e+00, %a
+  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
-  %fneg = fsub float -0.000000e+00, %mul
+  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
@@ -2023,7 +2023,7 @@ define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrsp
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
-  %fneg = fsub float -0.000000e+00, %sin
+  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index c53b817344c50..9164e5e267914 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -59,3 +59,31 @@ body: |
     S_ENDPGM 0, implicit %1, implicit %2
 ...
+
+# GCN-LABEL: name: no_fold_imm_into_m0{{$}}
+# GCN: %0:sreg_32 = S_MOV_B32 -8
+# GCN-NEXT: $m0 = COPY %0
+
+---
+name: no_fold_imm_into_m0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $m0 = COPY %0
+    S_ENDPGM 0, implicit $m0
+
+...
+
+# GCN-LABEL: name: fold_sgpr_imm_to_vgpr_copy{{$}}
+# GCN: $vgpr0 = V_MOV_B32_e32 -8, implicit $exec
+---
+name: fold_sgpr_imm_to_vgpr_copy
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $vgpr0 = COPY %0
+    S_ENDPGM 0, implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 2dadb5e80dfd7..bacbfcb8f500d 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -767,21 +767,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; SI-NEXT:    v_lshl_b32_e32 v0, v0, v7
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v6
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT:    v_mov_b32_e32 v9, 0xffff
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT:    v_and_b32_e32 v3, 15, v8
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, 16, v3
+; SI-NEXT:    v_and_b32_e32 v10, s4, v5
+; SI-NEXT:    v_lshr_b32_e32 v4, v10, v3
+; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
+; SI-NEXT:    v_mov_b32_e32 v9, 0xffff
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v5, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v0, v9, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_and_b32_e32 v1, 15, v8
-; SI-NEXT:    v_sub_i32_e32 v4, vcc, 16, v1
-; SI-NEXT:    v_and_b32_e32 v10, s4, v5
-; SI-NEXT:    v_lshr_b32_e32 v3, v10, v1
-; SI-NEXT:    v_lshl_b32_e32 v2, v2, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
-; SI-NEXT:    v_and_b32_e32 v2, v9, v1
-; SI-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; SI-NEXT:    v_and_b32_e32 v2, v9, v3
+; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v3i16:
@@ -865,46 +865,46 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0xffff
-; SI-NEXT:    v_and_b32_e32 v11, 15, v11
-; SI-NEXT:    v_and_b32_e32 v16, s4, v7
-; SI-NEXT:    v_sub_i32_e32 v17, vcc, 16, v11
-; SI-NEXT:    v_lshr_b32_e32 v16, v16, v11
-; SI-NEXT:    v_lshl_b32_e32 v3, v3, v17
-; SI-NEXT:    v_or_b32_e32 v3, v3, v16
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_and_b32_e32 v9, 15, v9 +; SI-NEXT: v_and_b32_e32 v16, s4, v5 +; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v9 +; SI-NEXT: v_lshr_b32_e32 v16, v16, v9 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v16 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_and_b32_e32 v5, 15, v8 +; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, s4, v4 +; SI-NEXT: v_lshr_b32_e32 v8, v15, v5 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_and_b32_e32 v4, 15, v11 +; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v4 +; SI-NEXT: v_and_b32_e32 v14, s4, v7 +; SI-NEXT: v_lshr_b32_e32 v5, v14, v4 +; SI-NEXT: v_lshl_b32_e32 v3, v3, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v4, 15, v10 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; SI-NEXT: v_and_b32_e32 v7, 15, v10 -; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7 -; SI-NEXT: v_and_b32_e32 v15, s4, v6 -; SI-NEXT: v_lshr_b32_e32 v10, v15, v7 -; SI-NEXT: v_lshl_b32_e32 v2, v2, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4 +; SI-NEXT: v_and_b32_e32 v13, s4, v6 +; SI-NEXT: v_lshr_b32_e32 v5, v13, v4 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; SI-NEXT: v_mov_b32_e32 v12, 0xffff ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, v12, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 15, v9 -; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v3 -; SI-NEXT: v_and_b32_e32 v14, s4, v5 -; SI-NEXT: v_lshr_b32_e32 v6, v14, v3 -; SI-NEXT: v_lshl_b32_e32 v1, v1, v7 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: v_and_b32_e32 v3, 15, v8 -; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3 -; SI-NEXT: v_and_b32_e32 v13, s4, v4 -; SI-NEXT: v_lshr_b32_e32 v5, v13, v3 -; SI-NEXT: v_lshl_b32_e32 v0, v0, v6 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, v12, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1214,9 +1214,9 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: s_mov_b32 s4, 0xffffff ; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0 @@ -1231,8 +1231,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v3, s4, v3 ; SI-NEXT: 
v_mul_hi_u32 v13, v3, s5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, s4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, s4, v5 ; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 ; SI-NEXT: v_mul_lo_u32 v12, v12, 24 ; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 @@ -1243,18 +1243,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2 ; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3 ; SI-NEXT: v_and_b32_e32 v13, s4, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b32_e32 v5, v5, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshl_b32_e32 v6, v6, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 ; SI-NEXT: v_lshr_b32_e32 v11, v11, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_lshl_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; SI-NEXT: v_or_b32_e32 v4, v4, v11 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen ; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 @@ -1274,9 +1273,9 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; VI-NEXT: s_mov_b32 s4, 0xffffff ; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab ; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0 @@ -1291,8 +1290,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_and_b32_e32 v3, s4, v3 ; VI-NEXT: v_mul_hi_u32 v13, v3, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_and_b32_e32 v11, s4, v4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v11, s4, v5 ; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 ; VI-NEXT: v_mul_lo_u32 v12, v12, 24 ; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 @@ -1303,18 +1302,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2 ; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3 ; VI-NEXT: v_and_b32_e32 v13, s4, v13 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v5, v13, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, v13, v6 ; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 ; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v6, v14, v6 -; VI-NEXT: v_or_b32_e32 v5, v5, v12 +; VI-NEXT: v_lshlrev_b32_e32 v4, v14, v4 +; VI-NEXT: v_or_b32_e32 v6, v6, v12 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; VI-NEXT: v_or_b32_e32 v4, v4, v11 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; VI-NEXT: 
v_cndmask_b32_e32 v2, v4, v5, vcc ; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen ; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 @@ -1329,44 +1327,42 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v10, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v2, s5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, s5 +; GFX9-NEXT: v_mul_hi_u32 v7, v2, s5 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v9, s4, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, v2, v10 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v7, 24, v2 -; GFX9-NEXT: v_sub_u32_e32 v10, 24, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v10, s4, v8 +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, v1, v10 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, 24, v1 +; GFX9-NEXT: v_sub_u32_e32 v10, 24, v2 ; GFX9-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, v3, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, v2, v9 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v5, v5, v7, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc +; GFX9-NEXT: v_lshl_or_b32 v3, v3, v10, v9 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v6, v8, v10, v9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 ; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 9a88c82b5a85e..e7d48de6ed606 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -1470,21 +1470,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018 ; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014 ; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010 -; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000 -; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004 -; 
GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x4000c +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40000 ; GFX7-NEXT: s_ashr_i32 s14, s5, 28 -; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c +; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40004 ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014 ; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40000 ; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c +; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2 @@ -1494,17 +1494,17 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 171cb6ac6ea77..ef0825120f20d 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1988,17 +1988,17 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 ; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_and_b32 s18, s5, 15 +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 ; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 @@ -2006,25 +2006,25 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 ; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_and_b32 s11, s4, 15 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008 +; GFX7-NEXT: s_and_b32 s4, s4, 15 ; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, s16 ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index bca00f69e25cd..bfc42e48bca86 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -273,8 +273,8 @@ entry: ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} ; MOVREL: v_movreld_b32_e32 v0, 5 -; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}} -; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST) +; IDXMODE: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00{{$}} +; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST) ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) { diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index e26b1c9471049..3d3b511ab34b7 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -88,7 +88,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit-def undef $mode, implicit $m0, implicit $mode ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5) - ; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0 + ; GCN: renamable $vgpr18 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0 ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GCN: renamable $vgpr19 = COPY renamable $vgpr18 ; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr4_sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 72f8c9cfc55ff..9aee862dbe119 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,10 @@ define amdgpu_kernel void 
@struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 -; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 +; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll index ce1551e44e51b..2da96c4480608 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll @@ -133,6 +133,17 @@ define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 ret void } +; Make sure this selects successfully with no use. The result register needs to be constrained. +; GCN-LABEL: {{^}}ds_append_lds_no_use: +; GCN: s_load_dword [[PTR:s[0-9]+]] +; GCN: s_mov_b32 m0, [[PTR]] +; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}} +define amdgpu_kernel void @ds_append_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 { + %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383 + %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false) + ret void +} + declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1 declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll index 175c0cf7760ac..40f20bc795222 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll @@ -127,6 +127,17 @@ define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 ret void } +; Make sure this selects successfully with no use. The result register needs to be constrained. 
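; [annotation, not part of the original patch] ds_append and ds_consume return
; the counter value in a VGPR. When that result is unused, as in this pair of
; tests, no consumer constrains the destination register class, so instruction
; selection itself must constrain it or the machine verifier complains. A
; hypothetical minimal reproducer (%ptr assumed):
;   %dead = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %ptr, i1 false)
;   ret void                                   ; result intentionally dead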
+; GCN-LABEL: {{^}}ds_consume_lds_no_use: +; GCN: s_load_dword [[PTR:s[0-9]+]] +; GCN: s_mov_b32 m0, [[PTR]] +; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}} +define amdgpu_kernel void @ds_consume_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 { + %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383 + %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false) + ret void +} + declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1 declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll index 0d3e3b7efa258..db7949f540964 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX10,GFX10-PACKED %s ; GCN-LABEL: {{^}}tbuffer_load_d16_x: -; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { main_body: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) @@ -13,11 +13,11 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { main_body: @@ -27,12 +27,12 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 -; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0 +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, 
-; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:22, 0
+; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
+; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT]
 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
 define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
index 1aff4b9bc7e2f..462c9dcf1a239 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
@@ -3,14 +3,14 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
 
 ; GCN-LABEL: {{^}}tbuffer_load:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 glc
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:63, 0 glc
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 slc
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 glc dlc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] glc
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] glc
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] slc
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] glc dlc
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
 main_body:
@@ -29,8 +29,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offset:42
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offset:42
 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0)
@@ -39,12 +39,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:47, 61 offset:4095
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:62, {{s[0-9]+}} offset:73
-; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, {{s[0-9]+}} offset:1
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] offset:73
+; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] offset:1
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
   %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0)
@@ -60,8 +60,8 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offen
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen
 define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0)
@@ -70,8 +70,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 offen offset:52
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen offset:52
 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %ofs = add i32 %voffs, 52
@@ -81,8 +81,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_load_xy:
-; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
-; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0
+; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
+; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT]
 define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
   %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
   %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
@@ -90,8 +90,8 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 }
 
 ; GCN-LABEL: {{^}}buffer_load_x:
-; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
-; GFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0
+; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
+; GFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT]
 define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
   %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0)
   %vdata.f = bitcast i32 %vdata to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index b337714f27b1c..5041cf3197342 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -8,8 +8,8 @@
 ; GCN-DAG: s_load_dwordx4
 ; GCN-DAG: s_load_dword s[[S_LO:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
-; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], format:33, 0
+; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
+; GFX10: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED]
 define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -22,10 +22,10 @@ main_body:
 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]

-; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
-; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0
+; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
+; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED]
 define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
@@ -43,13 +43,13 @@ main_body:
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
 
-; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
 
 ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
 ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0
-; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], format:33, 0
+; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
+; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED]
 define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
 main_body:
   call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
index 9c88de9c4f88e..6bc79150b8af5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
@@ -3,14 +3,14 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}tbuffer_store:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
-; PREGFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
-; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
-; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 glc
-; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:44, 0
-; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], format:61, 0 glc
-; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 slc
-; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], format:78, 0 glc dlc
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED]
+; PREGFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc
+; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc
+; PREGFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] glc
+; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM]
+; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] glc
+; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc
+; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc
 define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -24,8 +24,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_immoffs:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
-; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, 0 offset:42
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
+; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42
 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -34,8 +34,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
-; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], format:117, {{s[0-9]+}} offset:42
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
+; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:117 offset:42
 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -44,8 +44,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_ofs:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
-; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:115, 0 offen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen
+; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -54,8 +54,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_x1:
-; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0
-; GFX10: tbuffer_store_format_x v0, off, s[0:3], format:125, 0
+; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT]
+; GFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:125
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) {
 main_body:
   %data.i = bitcast float %data to i32
@@ -64,8 +64,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_x2:
-; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0
-; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], format:33, 0
+; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED]
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) {
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 906a8a3e05f44..69cbf62c538d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -2,6 +2,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
 ; GCN-LABEL: {{^}}s_getreg_test:
 ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
 define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
@@ -21,7 +26,7 @@ define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; si
   ret void
 }
 
-declare i32 @llvm.amdgcn.s.getreg(i32) #0
+declare i32 @llvm.amdgcn.s.getreg(i32 immarg) #0
 
 attributes #0 = { nounwind readonly }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
index 36aa5012a17c9..2fd21a10564d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll
@@ -5,8 +5,8 @@
 ; GCN-LABEL: {{^}}tbuffer_load_d16_x:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
-; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
+; PREGFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
+; GFX10: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen
 define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
 main_body:
   %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
@@ -15,11 +15,11 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
+; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
 ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
 
-; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
-; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
+; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
+; GFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen
 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
 define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
 main_body:
@@ -30,11 +30,11 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
+; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
 ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
 
-; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen
-; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], format:22, 0 idxen
+; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
+; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen
 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
 define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
index 1150347650c8d..ac18761534ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
@@ -4,14 +4,14 @@
 ; GCN-LABEL: {{^}}tbuffer_load:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 idxen glc
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen slc
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen glc
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:63, 0 idxen glc
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 idxen slc
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:22, 0 idxen glc dlc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen glc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen slc
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen glc
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] idxen glc
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen slc
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen glc dlc
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
 main_body:
@@ -31,8 +31,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offset:42
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offset:42
 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0)
@@ -42,12 +42,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 idxen offset:4095
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} idxen offset:73
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} idxen offset:1
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:47, 61 idxen offset:4095
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:62, {{s[0-9]+}} idxen offset:73
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, format:77, {{s[0-9]+}} idxen offset:1
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen offset:4095
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen offset:73
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:1
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] idxen offset:4095
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] idxen offset:73
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:1
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
   %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0)
@@ -63,8 +63,8 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_idx:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen
 define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0)
@@ -73,8 +73,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen
 define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0)
@@ -83,8 +83,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen offset:52
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen offset:52
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen offset:52
 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %ofs = add i32 %voffs, 52
@@ -94,8 +94,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_both:
-; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
-; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, format:78, 0 idxen offen
+; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen
+; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen
 define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 78, i32 0)
@@ -105,8 +105,8 @@ main_body:
 
 ; GCN-LABEL: {{^}}buffer_load_xy:
-; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen
-; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 idxen
+; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen
+; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
 define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
   %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
   %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
@@ -114,8 +114,8 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 }
 
 ; GCN-LABEL: {{^}}buffer_load_x:
-; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen
-; GFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, format:77, 0 idxen
+; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen
+; GFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen
 define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
   %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0)
   %vdata.f = bitcast i32 %vdata to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 519eef4cb5b7c..ca78b29cc8f53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -8,8 +8,8 @@
 ; GCN-DAG: s_load_dwordx4
 ; GCN-DAG: s_load_dword{{[x0-2]*}} s{{\[}}[[S_LO:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
-; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen
+; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -22,10 +22,10 @@ main_body:
 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 
-; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
-; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen
+; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
@@ -43,12 +43,12 @@ main_body:
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
 
-; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 
 ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
 ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
-; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], format:33, 0 idxen
+; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 254ad80c2bd9b..24247d320c88a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -4,14 +4,14 @@
 ; GCN-LABEL: {{^}}tbuffer_store:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:12, nfmt:2, 0 idxen
-; PREGFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], dfmt:13, nfmt:3, 0 idxen glc
-; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen slc
-; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen glc
-; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:44, 0 idxen
-; GFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], format:61, 0 idxen glc
-; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], format:78, 0 idxen slc
-; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], format:78, 0 idxen glc dlc
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] idxen
+; PREGFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] idxen glc
+; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen slc
+; PREGFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc
+; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_10_10_10_2_UNORM] idxen
+; GFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen glc
+; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen slc
+; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen glc dlc
 define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -26,8 +26,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_immoffs:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42
-; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:117, 0 idxen offset:42
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42
+; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:117 idxen offset:42
 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -37,8 +37,8 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} idxen offset:42
-; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], format:117, {{s[0-9]+}} idxen offset:42
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42
+; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:117 idxen offset:42
 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -47,8 +47,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_idx:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen
-; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:47, 0 idxen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen
+; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen
 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -57,8 +57,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_ofs:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], dfmt:3, nfmt:7, 0 idxen offen
-; GFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], format:115, 0 idxen offen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] idxen offen
+; GFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:115 idxen offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -67,8 +67,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_both:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen
-; GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], format:70, 0 idxen offen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen
+; GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_16_16_16_16_SINT] idxen offen
 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -79,13 +79,13 @@ main_body:
 ; Ideally, the register allocator would avoid the wait here
 ;
 ; GCN-LABEL: {{^}}buffer_store_wait:
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen
-; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], format:63, 0 idxen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
+; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_SINT] idxen
 ; VERDE: s_waitcnt expcnt(0)
 ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
 ; GCN: s_waitcnt vmcnt(0)
-; PREGFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen
-; GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], format:46, 0 idxen
+; PREGFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen
+; GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_10_10_10_2_USCALED] idxen
 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
 main_body:
   %in1 = bitcast <4 x float> %vdata to <4 x i32>
@@ -97,8 +97,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_x1:
-; PREGFX10: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen
-; GFX10: tbuffer_store_format_x v0, v1, s[0:3], format:125, 0 idxen
+; PREGFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
+; GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
 main_body:
   %data.i = bitcast float %data to i32
@@ -107,8 +107,8 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}buffer_store_x2:
-; PREGFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen
-; GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], format:33, 0 idxen
+; PREGFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+; GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
 main_body:
   %data.i = bitcast <2 x float> %data to <2 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
index 8850acae1b9cc..205cc5f78d335 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
 
 ; GCN-LABEL: {{^}}tbuffer_load_d16_x:
-; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
+; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
 main_body:
   %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
@@ -11,10 +11,10 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
-; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
+; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
 
-; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
+; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
 define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
 main_body:
@@ -24,10 +24,10 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
-; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
+; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
 
-; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0
+; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
 define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
 main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
index 0193d97398910..b04f3c09729a0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
@@ -2,8 +2,8 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GCNX3
 
 ; GCN-LABEL: {{^}}tbuffer_raw_load_immoffs_x3:
-; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
-; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
+; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
 define amdgpu_vs <3 x float> @tbuffer_raw_load_immoffs_x3(<4 x i32> inreg) {
 main_body:
   %vdata = call <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0)
@@ -14,8 +14,8 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_struct_load_immoffs_x3:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42
-; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42
+; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42
+; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42
 define amdgpu_vs <3 x float> @tbuffer_struct_load_immoffs_x3(<4 x i32> inreg) {
 main_body:
   %vdata = call <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0)
@@ -25,8 +25,8 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3:
-; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
-; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
+; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
 define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) {
 main_body:
   %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
index 712ee7ad1e5cb..aca9e0aaf9a26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -2,10 +2,10 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}tbuffer_load:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
 main_body:
@@ -24,7 +24,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
@@ -33,9 +33,9 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1
 ; GCN: s_waitcnt
 define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
@@ -51,7 +51,7 @@ define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_l
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_idx:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen
 define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
@@ -60,7 +60,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen
 define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
@@ -69,7 +69,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52
 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
@@ -78,7 +78,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_load_both:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen
 define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
 main_body:
   %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
@@ -88,7 +88,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}buffer_load_xy:
-; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
 define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
   %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
   %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
@@ -96,7 +96,7 @@ define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
 }
 
 ; GCN-LABEL: {{^}}buffer_load_x:
-; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
 define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
   %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
   %vdata.f = bitcast i32 %vdata to float
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
index 85aaee3dd2e38..4dd76a3a632dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}tbuffer_store_d16_x:
 ; GCN: s_load_dword s[[S_LO:[0-9]+]]
 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
@@ -19,9 +19,9 @@ main_body:
 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 
-; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
@@ -39,11 +39,11 @@ main_body:
 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
 
-; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 
 ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
 ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
+; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
 define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
 main_body:
   call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
index a39614c1cf052..dc5abf418131c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN
 
 ; GCN-LABEL: {{^}}tbuffer_raw_store_immoffs_x3:
-; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
 define amdgpu_ps void @tbuffer_raw_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
 main_body:
   %in1 = bitcast <3 x float> %1 to <3 x i32>
@@ -12,7 +12,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}tbuffer_struct_store_immoffs_x3:
 ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
-; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42
+; GCN: tbuffer_store_format_xyz v[0:2], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42
 define amdgpu_ps void @tbuffer_struct_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
 main_body:
   %in1 = bitcast <3 x float> %1 to <3 x i32>
@@ -21,7 +21,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3:
-; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
 define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
 main_body:
   %in1 = bitcast <3 x float> %1 to <3 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
index 4caa8081530ba..ed31ea3c40fea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -2,10 +2,10 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}tbuffer_store:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
-; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED]
+; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
 define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -19,7 +19,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_immoffs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 main_body:
   %in1 = bitcast <4 x float> %1 to <4 x i32>
@@ -28,7 +28,7 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
+; GCN: tbuffer_store_format_xyzw v[0:3],
off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -37,7 +37,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_idx: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -46,7 +46,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_ofs: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -55,7 +55,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_both: -; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -66,11 +66,11 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ; GCN-LABEL: {{^}}buffer_store_wait: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen ; VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) -; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -82,7 +82,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x1: -; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: %data.i = bitcast float %data to i32 @@ -91,7 +91,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_x2: -; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 76a218760e8e8..667425d86c692 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -237,157 +237,157 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* 
%out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s18, 0xfc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 -; SI-NEXT: s_brev_b32 s20, 1 -; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_brev_b32 s16, -2 -; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 -; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 -; SI-NEXT: s_add_i32 s10, s0, s18 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] -; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 -; SI-NEXT: s_and_b32 s0, s15, s20 -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s15 -; SI-NEXT: s_add_i32 s8, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 -; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] -; SI-NEXT: s_and_b32 s0, s13, s20 -; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: 
v_cmp_gt_i32_e64 s[0:1], s8, 51 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s12 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v13, s13 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 -; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; SI-NEXT: v_mov_b32_e32 v10, 0 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s18, 0xfc01 +; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s19, s0, s18 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 +; SI-NEXT: s_brev_b32 s20, 1 +; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] +; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s17 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 +; SI-NEXT: s_add_i32 s17, s0, s18 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; SI-NEXT: s_brev_b32 s16, -2 +; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 +; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 +; SI-NEXT: s_add_i32 s10, s0, s18 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] +; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 +; SI-NEXT: s_and_b32 s0, s15, s20 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 +; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: 
v_cndmask_b32_e64 v4, v4, v6, s[0:1] +; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] +; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v10, s15 +; SI-NEXT: s_add_i32 s8, s0, s18 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 +; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 +; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] +; SI-NEXT: s_and_b32 s0, s13, s20 +; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s8, 51 +; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 +; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; SI-NEXT: v_mov_b32_e32 v10, 0 +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] -; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_mov_b32_e32 v2, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v4, s9 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 -; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[14:15] -; CI-NEXT: v_mov_b32_e32 v10, s15 -; CI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; CI-NEXT: v_bfi_b32 v10, s2, v12, v10 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v6, 0 -; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] -; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v13, s13 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_endpgm +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s12, -2 +; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_waitcnt 
lgkmcnt(0) +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] +; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v4, s5 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] +; CI-NEXT: v_mov_b32_e32 v10, s11 +; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v6, 0 +; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] +; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v13, s9 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -600,82 +600,82 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v2, 0 -; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v6, s9 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 -; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] -; CI-NEXT: v_add_f64 v[4:5], s[14:15], -v[6:7] -; CI-NEXT: v_mov_b32_e32 v8, s15 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v10, s13 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc -; CI-NEXT: v_mov_b32_e32 v4, 0 -; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v8, s19 -; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[20:21] -; CI-NEXT: 
v_trunc_f64_e32 v[10:11], s[22:23] -; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v19, s23 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[22:23], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v17, s21 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] -; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_mov_b32_e32 v17, s17 -; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[12:13] -; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] -; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc -; CI-NEXT: v_mov_b32_e32 v16, 0 -; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_endpgm +; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] +; CI-NEXT: v_mov_b32_e32 v4, s11 +; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v6, s9 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 +; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; CI-NEXT: v_add_f64 v[4:5], s[14:15], -v[6:7] +; CI-NEXT: v_mov_b32_e32 v8, s15 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v10, s13 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 +; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v8, s19 +; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[20:21] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[22:23] +; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v19, s23 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; CI-NEXT: v_add_f64 v[14:15], s[22:23], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v17, s21 +; CI-NEXT: 
v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 +; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 +; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] +; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] +; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_mov_b32_e32 v17, s17 +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] +; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[12:13] +; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v14, 0 +; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] +; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] +; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; CI-NEXT: v_mov_b32_e32 v16, 0 +; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir new file mode 100644 index 0000000000000..08e6f1a067ac5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -0,0 +1,246 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-lower-control-flow -o - %s | FileCheck %s + +# Test si-lower-control-flow insertion points when other terminator +# instructions are present besides the control flow pseudo and a +# branch. + + +# There's another terminator instruction between SI_IF and +# S_BRANCH. The S_CBRANCH_EXECZ should be inserted immediately before +# S_BRANCH. 
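+#
+# A minimal sketch of that placement, distilled from the CHECK lines of
+# the first test below (names simplified; this is not an extra test):
+#
+#   terminators before lowering:       terminators after lowering:
+#     %3 = SI_IF %2, %bb.1, ...          $exec = S_MOV_B64_term killed %and
+#     %4 = S_MOV_B64_term killed %1      %4 = S_MOV_B64_term killed %1
+#     S_BRANCH %bb.2                     S_CBRANCH_EXECZ %bb.1, implicit $exec
+#                                        S_BRANCH %bb.2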
+--- +name: other_terminator_sbranch_after_si_if +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_if + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... + +# S_CBRANCH_EXECZ should be inserted after the other terminator +--- +name: other_terminator_fallthrough_after_si_if +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_fallthrough_after_si_if + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_IF %2, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... 
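+
+# The same placement rule applies to SI_ELSE. Per the CHECK lines below,
+# its expansion ends with a terminator write to $exec, and S_CBRANCH_EXECZ
+# again lands immediately before the S_BRANCH (sketch, names simplified):
+#
+#   %save = S_OR_SAVEEXEC_B64 %2, ...
+#   $exec = S_XOR_B64_term $exec, %save
+#   %4 = S_MOV_B64_term killed %1, implicit $exec
+#   S_CBRANCH_EXECZ %bb.1, implicit $exec
+#   S_BRANCH %bb.2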
+ +--- +name: other_terminator_sbranch_after_si_else +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_else + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY %2 + ; CHECK: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_SAVEEXEC_B64 [[COPY]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY1]], implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY2]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + %3:sreg_64_xexec = SI_ELSE %2, %bb.1, 0, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... + +--- +name: other_terminator_sbranch_after_si_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: other_terminator_sbranch_after_si_loop + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec + ; CHECK: $exec = S_ANDN2_B64_term $exec, [[V_CMP_EQ_U32_e64_]], implicit-def $scc + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[COPY1]], implicit $exec + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: S_ENDPGM 0, implicit [[S_MOV_B64_term]] + bb.0: + successors: %bb.2, %bb.1 + liveins: $vgpr0, $vgpr1, $sgpr4_sgpr5 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:sreg_64_xexec = COPY $sgpr4_sgpr5 + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec + SI_LOOP %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0, implicit %4 + +... + +# The save exec result register of SI_IF is used by other terminators +# inserted to behave as a lowered phi. The output register of SI_IF +# was ignored, and the def was removed, so the S_MOV_B64_term uses +# would fail the verifier. 
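+#
+# The pattern in question, reduced from the input below; the two
+# S_MOV_B64_term copies act as the lowered phi, so the lowered SI_IF must
+# keep %10 defined:
+#
+#   %10 = SI_IF %3, %bb.1, ...
+#   %14 = S_MOV_B64_term %10, implicit $exec
+#   %13 = S_MOV_B64_term %10, implicit $exec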
+ +--- +name: si_if_use +alignment: 1 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: si_if_use + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec + ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec + ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] + ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + ; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]] + ; CHECK: $exec = S_OR_B64 $exec, killed [[COPY5]], implicit-def $scc + ; CHECK: S_SLEEP 1 + ; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc + ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec + ; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + + %0:vgpr_32 = COPY killed $vgpr0 + %1:vgpr_32 = COPY killed $vgpr1 + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec + %10:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %14:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec + %13:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec + S_BRANCH %bb.2 + + bb.1: + %11:sreg_64_xexec = COPY %13 + dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + %14:sreg_64_xexec = COPY %11 + + bb.2: + %12:sreg_64_xexec = COPY %14 + SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec + S_SLEEP 1 + %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec + %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec + %13:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec + S_BRANCH %bb.2 + +... 
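
(The CHECK blocks in the new MIR test above are autogenerated, as its header notes. A typical way to refresh them after changing the pass, assuming an llc binary on PATH and the checkout root as working directory, is:

    utils/update_mir_test_checks.py llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
)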
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll index 6fe24c1dfb948..b414c83374b89 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll @@ -1,15 +1,23 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,CACHE_INV10 %s + +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,SKIP_CACHE_INV %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,SKIP_CACHE_INV %s + ; FUNC-LABEL: {{^}}system_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { entry: @@ -34,7 +42,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_acq_rel() { entry: @@ -47,7 +59,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_seq_cst() { entry: @@ -60,7 +76,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_acquire() { entry: @@ -85,7 +105,11 @@ entry: ; 
GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_acq_rel() { entry: @@ -98,7 +122,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @system_one_as_seq_cst() { entry: @@ -191,7 +219,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { entry: @@ -216,7 +248,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_acq_rel() { entry: @@ -229,7 +265,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_seq_cst() { entry: @@ -242,7 +282,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_acquire() { entry: @@ -267,7 +311,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_acq_rel() { entry: @@ -280,7 +328,11 @@ entry: ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: buffer_wbinvl1{{$}} +; CACHE_INV: buffer_wbinvl1{{$}} +; CACHE_INV10: buffer_gl0_inv +; CACHE_INV10: buffer_gl1_inv +; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} +; SKIP_CACHE_INV-NOT: buffer_gl ; GCN: s_endpgm define amdgpu_kernel void @agent_one_as_seq_cst() { entry: diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll index fea47db60d9ea..73f4f9e0cfc06 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll @@ -27,11 +27,10 @@ main_body: %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1 %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 
2 %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3 - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 78, i32 3) #2 ret void } -; Function Attrs: nounwind -declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll index 958692e0c92b1..e363d039548b0 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -11,13 +11,14 @@ define amdgpu_vs void @test1(i32 %v) #0 { store i32 %v, i32 addrspace(3)* %p0 - call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 68, i32 1) %w = load i32, i32 addrspace(3)* %p0 store i32 %w, i32 addrspace(3)* %p1 ret void } -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #1 attributes #0 = { nounwind } +attributes #1 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index cb45d1172e6c5..925a2daa93da7 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -566,7 +566,6 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword v ; CI: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll index b28e8fa72ec77..90da8406c4b78 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -1,7 +1,5 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -declare i32 @llvm.amdgcn.workitem.id.x() readnone - ;;;==========================================================================;;; ;;; MUBUF LOAD TESTS ;;;==========================================================================;;; @@ -60,10 +58,10 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 
3) ret void } @@ -79,10 +77,10 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3) ret void } @@ -136,14 +134,14 @@ entry: ; CHECK-LABEL: {{^}}store_sgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 -define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) { store i32 99, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -152,7 +150,7 @@ define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -161,7 +159,7 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) # ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) { %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst ret void @@ -169,14 +167,20 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* ; CHECK-LABEL: {{^}}store_vgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void } -declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0 -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2 +declare i32 
@llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #3 attributes #0 = { nounwind readonly } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nounwind willreturn writeonly } +attributes #3 = { nounwind readonly willreturn } +attributes #4 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-copies-extra-insts-after-copy.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-copies-extra-insts-after-copy.mir new file mode 100644 index 0000000000000..b0e67034a4031 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-copies-extra-insts-after-copy.mir @@ -0,0 +1,51 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-optimize-exec-masking -o - %s | FileCheck %s + +# Make sure we can still optimize writes to exec when there are +# additional terminators after the exec write. This can happen with +# phi users of control flow intrinsics. + +--- +name: instructions_after_copy_to_exec +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: instructions_after_copy_to_exec + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $vgpr0 + ; CHECK: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: renamable $vcc = V_CMP_EQ_U32_e64 0, killed $vgpr0, implicit $exec + ; CHECK: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; CHECK: renamable $sgpr0_sgpr1 = COPY killed renamable $sgpr0_sgpr1, implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: S_NOP 0, implicit $sgpr0_sgpr1 + ; CHECK: bb.2: + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: S_NOP 0, implicit $sgpr0_sgpr1 + bb.0: + liveins: $vgpr0 + + renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + renamable $vcc = V_CMP_EQ_U32_e64 0, killed $vgpr0, implicit $exec + renamable $sgpr0_sgpr1 = COPY $exec, implicit-def $exec + renamable $sgpr2_sgpr3 = S_AND_B64 renamable $sgpr0_sgpr1, renamable $vcc, implicit-def dead $scc + renamable $sgpr0_sgpr1 = S_XOR_B64 renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc + $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3 + renamable $sgpr0_sgpr1 = S_MOV_B64_term killed renamable $sgpr0_sgpr1, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + liveins: $sgpr0_sgpr1 + + S_NOP 0, implicit $sgpr0_sgpr1 + + bb.2: + liveins: $sgpr0_sgpr1 + + S_NOP 0, implicit $sgpr0_sgpr1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir new file mode 100644 index 0000000000000..0c62d666e7296 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir @@ -0,0 +1,201 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra,si-optimize-exec-masking-pre-ra -o - %s | FileCheck %s + +# FIXME: Second run of the pass is a workaround for a bug in +# -run-pass. The verifier doesn't detect broken LiveIntervals, see bug +# 46873 + + +# Cannot fold this without moving the def of %7 after the and. 
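+#
+# For reference, the fold being tested rewrites this sequence (sketch
+# distilled from the tests below, with %mask standing in for %7):
+#
+#   %c = V_CNDMASK_B32_e64 0, 0, 0, 1, %mask, implicit $exec
+#   V_CMP_NE_U32_e32 1, %c, implicit-def $vcc, implicit $exec
+#   $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
+#
+# into a single instruction:
+#
+#   $vcc = S_ANDN2_B64 $exec, %mask, implicit-def dead $scc
+#
+# which is only valid if %mask still holds the same value at the
+# S_AND_B64, hence the no-fold case here.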
+--- +name: no_fold_andn2_select_condition_live_out_phi +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec + ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec + ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0 + ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3) + ; CHECK: ATOMIC_FENCE 4, 2 + ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc + ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK: S_BRANCH %bb.2 + bb.0: + successors: %bb.2 + + %7:sreg_64_xexec = S_MOV_B64 -1 + undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_ENDPGM 0 + + bb.2: + successors: %bb.1, %bb.2 + + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec + V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec + %5.sub1:vreg_64 = COPY %5.sub0 + DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) + ATOMIC_FENCE 4, 2 + %7:sreg_64_xexec = S_MOV_B64 0 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_BRANCH %bb.2 + +... + +# It's OK to fold this, since the phi def is after the andn2 insert point. +--- +name: fold_andn2_select_condition_live_out_phi_reorder +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_reorder + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 + ; CHECK: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: %1.sub1:vreg_64 = COPY %1.sub0 + ; CHECK: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store 8, addrspace 3) + ; CHECK: ATOMIC_FENCE 4, 2 + ; CHECK: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc + ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK: S_BRANCH %bb.2 + bb.0: + successors: %bb.2 + + %7:sreg_64_xexec = S_MOV_B64 -1 + undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_ENDPGM 0 + + bb.2: + successors: %bb.1, %bb.2 + + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %7, implicit $exec + V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec + %5.sub1:vreg_64 = COPY %5.sub0 + DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) + ATOMIC_FENCE 4, 2 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + %7:sreg_64_xexec = S_MOV_B64 0 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_BRANCH %bb.2 + +... 
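For context on the pattern these tests probe: the si-optimize-exec-masking-pre-ra pass folds a select of 0/1, a compare against 1, and an AND with $exec into a single S_ANDN2_B64 of the original select condition. A minimal sketch of the rewrite, with illustrative operand names rather than the exact registers used in the tests:

  %sel:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %cond, implicit $exec  ; %sel = %cond ? 1 : 0
  V_CMP_NE_U32_e32 1, %sel, implicit-def $vcc, implicit $exec         ; per lane: $vcc = !%cond
  $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
  ; folds to:
  $vcc = S_ANDN2_B64 $exec, %cond, implicit-def dead $scc

The fold is only sound while %cond still holds its original value at the point where the S_ANDN2_B64 is inserted, which is exactly what the no_fold/fold pairs in this file pin down.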
+ +--- +name: no_fold_andn2_select_condition_live_out_phi_physreg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: no_fold_andn2_select_condition_live_out_phi_physreg + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1 + ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec + ; CHECK: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec + ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0 + ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3) + ; CHECK: ATOMIC_FENCE 4, 2 + ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc + ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK: S_BRANCH %bb.2 + bb.0: + successors: %bb.2 + + $sgpr4_sgpr5 = S_MOV_B64 -1 + undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_ENDPGM 0 + + bb.2: + successors: %bb.1, %bb.2 + liveins: $sgpr4_sgpr5 + + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec + V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec + %5.sub1:vreg_64 = COPY %5.sub0 + DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) + ATOMIC_FENCE 4, 2 + $sgpr4_sgpr5 = S_MOV_B64 0 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_BRANCH %bb.2 + +... + +--- +name: fold_andn2_select_condition_live_out_phi_physreg_reorder +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fold_andn2_select_condition_live_out_phi_physreg_reorder + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 -1 + ; CHECK: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: S_ENDPGM 0 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK: %0.sub1:vreg_64 = COPY %0.sub0 + ; CHECK: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store 8, addrspace 3) + ; CHECK: ATOMIC_FENCE 4, 2 + ; CHECK: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc + ; CHECK: $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc + ; CHECK: S_BRANCH %bb.2 + bb.0: + successors: %bb.2 + + $sgpr4_sgpr5 = S_MOV_B64 -1 + undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + S_ENDPGM 0 + + bb.2: + successors: %bb.1, %bb.2 + liveins: $sgpr4_sgpr5 + + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec + V_CMP_NE_U32_e32 1, %4, implicit-def $vcc, implicit $exec + %5.sub1:vreg_64 = COPY %5.sub0 + DS_WRITE_B64_gfx9 undef %6:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) + ATOMIC_FENCE 4, 2 + $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc + $sgpr4_sgpr5 = S_MOV_B64 0 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc + S_BRANCH %bb.2 + +... 
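The physreg pair above reduces to an ordering question: the S_ANDN2_B64 is inserted where the original S_AND_B64 was, so a redefinition of the condition register after that point is harmless, while one before it would clobber the value being read. A side-by-side sketch distilled from the CHECK lines above (illustrative, not a verbatim quote of the tests):

  ; foldable: the redef of $sgpr4_sgpr5 lands after the rewrite point
  $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc
  $sgpr4_sgpr5 = S_MOV_B64 0

  ; not foldable: the redef comes first, so the plain and must stay
  $sgpr4_sgpr5 = S_MOV_B64 0
  $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc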
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-strip-terminator-bits.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-strip-terminator-bits.mir new file mode 100644 index 0000000000000..3e939bd476041 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-strip-terminator-bits.mir @@ -0,0 +1,44 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-optimize-exec-masking -o - %s | FileCheck %s + +# There are multiple _term pseudos here. Starting from the end of the +# block, they all need to be converted to regular copies or removed in +# order to avoid making the verifier unhappy. + +--- +name: multi_term_pseudos +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: multi_term_pseudos + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK: $exec = COPY killed renamable $sgpr4_sgpr5 + ; CHECK: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr6_sgpr7, implicit $exec + ; CHECK: renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr8_sgpr9, implicit $exec + ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.1: + ; CHECK: liveins: $sgpr12_sgpr13 + ; CHECK: S_ENDPGM 0, implicit $sgpr12_sgpr13 + ; CHECK: bb.2: + ; CHECK: liveins: $sgpr12_sgpr13 + ; CHECK: S_ENDPGM 0, implicit $sgpr12_sgpr13 + bb.0: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9 + $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5 + renamable $sgpr10_sgpr11 = S_MOV_B64_term killed renamable $sgpr6_sgpr7, implicit $exec + renamable $sgpr12_sgpr13 = S_MOV_B64_term killed renamable $sgpr8_sgpr9, implicit $exec + S_CBRANCH_EXECZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.1: + liveins: $sgpr12_sgpr13 + S_ENDPGM 0, implicit $sgpr12_sgpr13 + + bb.2: + liveins: $sgpr12_sgpr13 + S_ENDPGM 0, implicit $sgpr12_sgpr13 + +... diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll index 83a608ad5f31c..69fbac7b44a33 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -1,10 +1,12 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s -; This shows that the amount of LDS estimate is sensitive to the order -; of the LDS globals. +; This shows that the LDS size estimate should try not to be +; sensitive to the order of the LDS globals. It should try to +; estimate the worst-case padding behavior to avoid overallocating +; LDS. -; Both of these functions use the same amount of LDS, but the total -; changes depending on the visit order of first use. +; These functions use the same amount of LDS, but the total final +; size changes depending on the visit order of first use.
; The one with the suboptimal order resulting in extra padding exceeds ; the desired limit @@ -29,7 +31,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: -; GCN: workgroup_group_segment_byte_size = 2340 +; GCN: workgroup_group_segment_byte_size = 1060 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) @@ -61,7 +63,7 @@ entry: } ; GCN-LABEL: {{^}}promote_alloca_size_order_1: -; GCN: workgroup_group_segment_byte_size = 2352 +; GCN: workgroup_group_segment_byte_size = 1072 define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll index a6511910603e1..4e23875502e96 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -4,16 +4,20 @@ target datalayout = "A5" @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4 +@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4 + +@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4 +@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4 ; This function cannot promote to using LDS because of the size of the ; constant expression use in the function, which was previously not ; detected. -; IR-LABEL: @constant_expression_uses_lds( +; IR-LABEL: @constant_expression_uses_all_lds( ; IR: alloca -; ASM-LABEL: constant_expression_uses_lds: -; ASM: .group_segment_fixed_size: 65536 -define amdgpu_kernel void @constant_expression_uses_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +; ASM-LABEL: constant_expression_uses_all_lds: +; ASM: .amdhsa_group_segment_fixed_size 65536 +define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 @@ -32,4 +36,130 @@ entry: ret void } -attributes #0 = { "amdgpu-waves-per-eu"="1,5" } +; Has a constant expression use through a single level of constant +; expression, but not enough LDS to block promotion + +; IR-LABEL: @constant_expression_uses_some_lds( +; IR-NOT: alloca + +; ASM-LABEL: {{^}}constant_expression_uses_some_lds: +; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} +define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +entry: + %stack = alloca [4 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 + %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 + store i32 9, i32 addrspace(5)* %gep0 + store i32 10, i32 addrspace(5)* %gep1 + store i32 99, i32 addrspace(5)* %gep2 + store i32 43, i32 addrspace(5)* %gep3 + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx + %load = 
load i32, i32 addrspace(5)* %arrayidx, align 4 + store i32 %load, i32 addrspace(1)* %out + store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef + ret void +} + +declare void @callee(i8*) + +; IR-LABEL: @constant_expression_uses_all_lds_multi_level( +; IR: alloca + +; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level: +; ASM: .amdhsa_group_segment_fixed_size 65536{{$}} +define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +entry: + %stack = alloca [4 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 + %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 + store i32 9, i32 addrspace(5)* %gep0 + store i32 10, i32 addrspace(5)* %gep1 + store i32 99, i32 addrspace(5)* %gep2 + store i32 43, i32 addrspace(5)* %gep3 + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx + %load = load i32, i32 addrspace(5)* %arrayidx, align 4 + store i32 %load, i32 addrspace(1)* %out + call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) + ret void +} + +; IR-LABEL: @constant_expression_uses_some_lds_multi_level( +; IR-NOT: alloca +; IR: llvm.amdgcn.workitem.id + +; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level: +; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} +define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +entry: + %stack = alloca [4 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 + %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 + store i32 9, i32 addrspace(5)* %gep0 + store i32 10, i32 addrspace(5)* %gep1 + store i32 99, i32 addrspace(5)* %gep2 + store i32 43, i32 addrspace(5)* %gep3 + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx + %load = load i32, i32 addrspace(5)* %arrayidx, align 4 + store i32 %load, i32 addrspace(1)* %out + call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) + ret void +} + +; IR-LABEL: @constant_expression_uses_some_lds_global_initializer( +; IR-NOT: alloca +; IR: llvm.amdgcn.workitem.id + +; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer: +; ASM: .amdhsa_group_segment_fixed_size 4096{{$}} +define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +entry: + %stack = alloca [4 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 + %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* 
%stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 + store i32 9, i32 addrspace(5)* %gep0 + store i32 10, i32 addrspace(5)* %gep1 + store i32 99, i32 addrspace(5)* %gep2 + store i32 43, i32 addrspace(5)* %gep3 + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx + %load = load i32, i32 addrspace(5)* %arrayidx, align 4 + store i32 %load, i32 addrspace(1)* %out + + store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef + ret void +} + +; We can't actually handle LDS initializers in global initializers, +; but this should count as usage. + +; IR-LABEL: @constant_expression_uses_all_lds_global_initializer( +; IR: alloca + +; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer: +; ASM: .group_segment_fixed_size: 65536 +define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +entry: + %stack = alloca [4 x i32], align 4, addrspace(5) + %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 + %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 + store i32 9, i32 addrspace(5)* %gep0 + store i32 10, i32 addrspace(5)* %gep1 + store i32 99, i32 addrspace(5)* %gep2 + store i32 43, i32 addrspace(5)* %gep3 + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx + %load = load i32, i32 addrspace(5)* %arrayidx, align 4 + store i32 %load, i32 addrspace(1)* %out + store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll index 2fecbcb9d7a6b..bd37356b15ecb 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -129,7 +129,7 @@ endif: ; } ; FIXME: This should be promotable. We need to use -; GetUnderlyingObjects when looking at the icmp user. +; getUnderlyingObjects when looking at the icmp user. 
; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index a9a60b93ef54d..50df07e0a8f43 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,9 +17,9 @@ define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 @@ -86,14 +86,14 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -300,9 +300,9 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) { ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -456,10 +456,10 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) { ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: 
global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir new file mode 100644 index 0000000000000..49d6a9ad19717 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign-wave64.mir @@ -0,0 +1,69 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s + + +# Test that subreg reassignments are correctly handled when the whole register +# also conflicts. If this is mishandled, stall counts will be incorrect and +# cause an infinite loop. +# GCN-LABEL: vgpr64_mixed_use{{$}} +# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF +# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF +# GCN: $vcc = IMPLICIT_DEF +# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF +# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF +# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF +# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF +# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF +# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF +# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF +# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF +# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF +# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF +# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF +# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec +# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec +# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 $vgpr4_vgpr5, $vgpr0_vgpr1, implicit $exec +--- +name: vgpr64_mixed_use +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' } + - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64_xexec } + - { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' } + - { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' } + - { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } + - { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } + - { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } + - { id: 
17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF + %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %13 = IMPLICIT_DEF + %14 = IMPLICIT_DEF + %15 = IMPLICIT_DEF + %16 = IMPLICIT_DEF + %17 = IMPLICIT_DEF + %3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec + %4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec + %5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir index 0020e17a0b6fe..2078d8c22922b 100644 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -494,3 +494,81 @@ body: | %2 = V_AND_B32_e32 %1, %0, implicit $exec S_ENDPGM 0 ... + +# Test that the bank of a subreg is considered during scavenging. +# If handled incorrectly, an infinite loop occurs. +# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}} +# GCN: S_AND_B32 renamable $sgpr13, $sgpr0, +--- +name: s0_vs_s15_16_17_sub1 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc + S_ENDPGM 0 +... + +# Test that the size of a subreg is correctly handled in the bank calculation. +# If handled incorrectly, an infinite loop occurs. +# GCN-LABEL: vgpr_sub_dependence{{$}} +# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF +# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF +# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF +# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF +# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF +# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF +# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF +# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF +# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF +# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF +# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF +# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF +# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF +# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec +# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec +--- +name: vgpr_sub_dependence +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' } + - { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' } + - { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' } + - { id: 3, class: vreg_64 } + - { id: 4, class: vreg_64 } + - { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' } + - { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' } + - { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' } + - { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' } + - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' } + - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' } + - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' } + - { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' } + - { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' } + - { id: 14, class: 
vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' } + - { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' } +body: | + bb.0: + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %5 = IMPLICIT_DEF + %6 = IMPLICIT_DEF + %7 = IMPLICIT_DEF + %8 = IMPLICIT_DEF + %9 = IMPLICIT_DEF + %10 = IMPLICIT_DEF + %11 = IMPLICIT_DEF + %12 = IMPLICIT_DEF + %13 = IMPLICIT_DEF + %14 = IMPLICIT_DEF + %15 = IMPLICIT_DEF + %3 = V_ADD_F64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec + %4 = V_ADD_F64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll index 9df99aae15d49..73d837efa9f44 100644 --- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll @@ -5,17 +5,21 @@ define void @child_function() #0 { ret void } -; GCN-LABEL: {{^}}parent_func: -; CHECK: v_writelane_b32 v255, s33, 2 -; CHECK: v_writelane_b32 v255, s30, 0 -; CHECK: v_writelane_b32 v255, s31, 1 -; CHECK: s_swappc_b64 s[30:31], s[4:5] -; CHECK: v_readlane_b32 s4, v255, 0 -; CHECK: v_readlane_b32 s5, v255, 1 -; CHECK: v_readlane_b32 s33, v255, 2 +; GCN-LABEL: {{^}}reserve_vgpr_with_no_lower_vgpr_available: +; GCN: buffer_store_dword v255, off, s[0:3], s32 +; GCN: v_writelane_b32 v255, s33, 2 +; GCN: v_writelane_b32 v255, s30, 0 +; GCN: v_writelane_b32 v255, s31, 1 +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: v_readlane_b32 s4, v255, 0 +; GCN: v_readlane_b32 s5, v255, 1 +; GCN: v_readlane_b32 s33, v255, 2 ; GCN: ; NumVgprs: 256 -define void @parent_func() #0 { +define void @reserve_vgpr_with_no_lower_vgpr_available() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} @@ -47,4 +51,140 @@ define void @parent_func() #0 { ret void } +; GCN-LABEL: {{^}}reserve_lowest_available_vgpr: +; GCN: buffer_store_dword v254, off, s[0:3], s32 +; GCN: v_writelane_b32 v254, s33, 2 +; GCN: v_writelane_b32 v254, s30, 0 +; GCN: v_writelane_b32 v254, s31, 1 +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: v_readlane_b32 s4, v254, 0 +; GCN: v_readlane_b32 s5, v254, 1 +; GCN: v_readlane_b32 s33, v254, 2 + +define void @reserve_lowest_available_vgpr() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + 
,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253}" () #0 + call void @child_function() + ret void +} + +; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills: +; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 +; GCN: ; def s4 +; GCN: v_writelane_b32 v254, s4, 2 +; GCN: v_readlane_b32 s4, v254, 2 +; GCN: ; use s4 + +define void @reserve_vgpr_with_sgpr_spills() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + 
,~{v250},~{v251},~{v252},~{v253}" () #0 + + %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 undef, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(i32 %sgpr) #0 + br label %ret + +ret: + ret void +} + +; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call +; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 +; GCN-NOT: v_writelane +; GCN: s_setpc_b64 s[4:5] + +define void @reserve_vgpr_with_tail_call() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253},~{v254}" () #0 + musttail call void @child_function() + ret void +} + attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index fe9e6275e0d58..296e8e770827e 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -136,17 +136,16 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0 -; GFX9-NEXT: s_add_i32 s1, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_i32 v4, s0, v4 clamp +; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; GFX9-NEXT: 
global_store_dword v[0:1], v4, off +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v4 +; GFX9-NEXT: global_store_dword v[0:1], v5, off ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v[2:3], v0, off ; GFX9-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -227,12 +226,11 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v6, v4, v5 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: global_store_dword v[0:1], v6, off -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: v_add_i32 v6, v4, v5 clamp +; GFX9-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v6 +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v[2:3], v0, off ; GFX9-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %aptr, align 4 @@ -495,17 +493,15 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v9, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v8, v0, v2 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off +; GFX9-NEXT: v_add_i32 v8, v0, v2 clamp +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll new file mode 100644 index 0000000000000..e78889e349dc8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -0,0 +1,439 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_saddsat_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) + ret i8 %result +} + +define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_saddsat_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 +; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %result +} + +define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_saddsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %result +} + +define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { +; GFX6-LABEL: v_saddsat_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 
16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 +; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) + ret <2 x i16> %result +} + +define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { +; GFX6-LABEL: v_saddsat_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, 
v9, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 +; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) + ret <3 x i16> %result +} + +define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; GFX6-LABEL: v_saddsat_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 +; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %cast = bitcast <4 x i16> %result to <2 x float> + ret <2 x float> %cast +} + +define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; GFX6-LABEL: v_saddsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 +; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v5, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 +; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %result +} + +define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_saddsat_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_saddsat_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_saddsat_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %result +} + +declare i8 @llvm.sadd.sat.i8(i8, i8) #0 +declare i16 @llvm.sadd.sat.i16(i16, i16) #0 +declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0 +declare i32 @llvm.sadd.sat.i32(i32, i32) #0 +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0 +declare i64 @llvm.sadd.sat.i64(i64, i64) #0 diff --git a/llvm/test/CodeGen/AMDGPU/sched-prefer-non-mfma.mir b/llvm/test/CodeGen/AMDGPU/sched-prefer-non-mfma.mir new file mode 100644 index 0000000000000..35d18c0c4c16d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched-prefer-non-mfma.mir @@ -0,0 +1,18 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass post-RA-sched -amdgpu-disable-power-sched -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: unrelated_mfma +# GCN: V_MFMA_F32_32X32X1F32 +# GCN: S_CMP_LG_U32 +# GCN: V_MFMA_F32_32X32X1F32 + +--- +name: unrelated_mfma +body: | + bb.0.entry: + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 $vgpr67, $vgpr66, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit 
$mode, implicit $exec + renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 = V_MFMA_F32_32X32X1F32 $vgpr69, $vgpr68, killed $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63, 0, 0, 0, implicit $mode, implicit $exec + renamable $sgpr2 = S_ADD_U32 renamable $sgpr2, 4, implicit-def $scc + renamable $sgpr3 = S_ADDC_U32 renamable $sgpr3, 0, implicit-def dead $scc, implicit killed $scc + S_CMP_LG_U32 renamable $sgpr2, 64, implicit-def $scc + +... diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index be76371c42df3..8aa48d4d24caa 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,31 +25,31 @@ main_body: %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0) - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 36, i32 %arg, i32 68, i32 3) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 48, i32 %arg, i32 68, i32 3) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 72, i32 %arg, i32 68, i32 3) %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 28, i32 %arg, i32 68, i32 3) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1) - call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1) - call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1) - call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 64, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x 
i32> undef, i32 20, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 56, i32 %arg, i32 68, i32 3) + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 92, i32 %arg, i32 68, i32 3) ret void } -declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #2 +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #3 attributes #0 = { nounwind "target-cpu"="tonga" } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind readonly } -attributes #3 = { nounwind } +attributes #1 = { nounwind readnone willreturn } +attributes #2 = { nounwind readonly willreturn } +attributes #3 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll index fe7a350a1d9f3..733399dd09096 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.000000e+00 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 %4 = bitcast i32 %3 to float %5 = bitcast float %4 to i32 @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 %4 = bitcast i32 %3 to float %5 = bitcast float %4 to i32 diff --git a/llvm/test/CodeGen/AMDGPU/set-dx10.ll b/llvm/test/CodeGen/AMDGPU/set-dx10.ll index 6867c6394937c..9a317a87540f4 100644 --- a/llvm/test/CodeGen/AMDGPU/set-dx10.ll +++ b/llvm/test/CodeGen/AMDGPU/set-dx10.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -64,7 +64,7 @@ define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -90,7 +90,7 @@ define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -116,7 +116,7 @@ define amdgpu_kernel void 
@fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -142,7 +142,7 @@ define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir new file mode 100644 index 0000000000000..777a692affca0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -0,0 +1,52 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s +# +# Check for liveness errors when spilling partially defined super registers. + +--- +name: sgpr_spill_s64_undef_high32 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + +stack: + - { id: 0, type: spill-slot, size: 8, alignment: 4, stack-id: sgpr-spill } + +body: | + bb.0: + liveins: $sgpr4 + + ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 + ; CHECK: liveins: $sgpr4, $vgpr0 + ; CHECK: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 $sgpr4, 0, undef $vgpr0, implicit-def $sgpr4_sgpr5 + ; CHECK: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 $sgpr5, 1, $vgpr0 + SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) + +... + +--- +name: sgpr_spill_s64_undef_low32 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + hasSpilledSGPRs: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + +stack: + - { id: 0, type: spill-slot, size: 8, alignment: 4, stack-id: sgpr-spill } + +body: | + bb.0: + liveins: $sgpr5 + + ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 + ; CHECK: liveins: $sgpr5, $vgpr0 + ; CHECK: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 $sgpr4, 0, undef $vgpr0, implicit-def $sgpr4_sgpr5 + ; CHECK: $vgpr0 = V_WRITELANE_B32_gfx6_gfx7 $sgpr5, 1, $vgpr0 + SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) + +... diff --git a/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir b/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir deleted file mode 100644 index 5850a3b27bce8..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/si-if-lower-user-terminators.mir +++ /dev/null @@ -1,75 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-lower-control-flow -verify-machineinstrs -o - %s | FileCheck %s - -# The save exec result register of SI_IF is used by other terminators -# inserted to behave as a lowered phi. The output register of SI_IF -# was ignored, and the def was removed, so the S_MOV_B64_term uses -# would fail the verifier. 
- ---- -name: si_if_use -alignment: 1 -legalized: true -regBankSelected: true -selected: true -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: si_if_use - ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 killed [[COPY]], killed [[COPY1]], implicit $exec - ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY2]], implicit-def dead $scc - ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_]] - ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec - ; CHECK: S_BRANCH %bb.2 - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] - ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) - ; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] - ; CHECK: bb.2: - ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]] - ; CHECK: $exec = S_OR_B64 $exec, killed [[COPY5]], implicit-def $scc - ; CHECK: S_SLEEP 1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc - ; CHECK: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK: S_CBRANCH_EXECZ %bb.1, implicit $exec - ; CHECK: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK: S_BRANCH %bb.2 - bb.0: - liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 - - %0:vgpr_32 = COPY killed $vgpr0 - %1:vgpr_32 = COPY killed $vgpr1 - %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec - %10:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %14:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec - %13:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec - S_BRANCH %bb.2 - - bb.1: - %11:sreg_64_xexec = COPY %13 - dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) - %14:sreg_64_xexec = COPY %11 - - bb.2: - %12:sreg_64_xexec = COPY %14 - SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec - S_SLEEP 1 - %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec - %13:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec - S_BRANCH %bb.2 - -... 
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index ec2dbb995c98c..225d7cc0d9a2f 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,12 +1,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.s.barrier() #1 -declare i32 @llvm.amdgcn.workitem.id.x() #2 - - @stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 @stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8 @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 @@ -296,30 +290,33 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace( ret void } -; XGCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load: -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 -; XCI: TBUFFER_STORE_FORMAT -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 -; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 { -; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 +; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load: +; GCN: tbuffer_store_format +; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2 +define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 -; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 -; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 -; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 -; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, -; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1) + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + %vaddr.add = add i32 %vaddr, 32 + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3) -; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 -; %add = add nsw i32 %tmp1, %tmp2 + %add = add nsw i32 %tmp1, %tmp2 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} -; store i32 %add, i32 addrspace(1)* %out, align 4 -; ret void -; } +declare void @llvm.amdgcn.s.barrier() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #2 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3 attributes #0 = { nounwind } 
-attributes #1 = { nounwind convergent } -attributes #2 = { nounwind readnone } +attributes #1 = { convergent nounwind willreturn } +attributes #2 = { nounwind readnone speculatable willreturn } +attributes #3 = { nounwind willreturn writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll new file mode 100644 index 0000000000000..138488ece9979 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -0,0 +1,1004 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_ssubsat_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) + ret i8 %result +} + +define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_ssubsat_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %result +} + +define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_ssubsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 +; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 +; 
GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %result +} + +define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { +; GFX6-LABEL: v_ssubsat_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) + ret <2 x i16> %result +} + +define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { +; GFX6-LABEL: v_ssubsat_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; 
GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) + ret <3 x i16> %result +} + +define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; GFX6-LABEL: v_ssubsat_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; 
GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 +; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x7fff +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v8, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %cast = bitcast <4 x i16> %result to <2 x float> + ret <2 x float> %cast +} + +define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; GFX6-LABEL: v_ssubsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v5, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v2i32: +; GFX8: 
; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v4, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v5, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %result +} + +define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { +; GFX6-LABEL: v_ssubsat_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v7, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: v_ssubsat_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i32 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) + ret <3 x i32> %result +} + +define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; GFX6-LABEL: v_ssubsat_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, v9, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v9, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_sub_i32 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %result +} + +define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { +; GFX6-LABEL: v_ssubsat_v8i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v5, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v7, v16, v17, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v8i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: 
v_cndmask_b32_e64 v0, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v16, v17, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i32 v0, v0, v8 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v9 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v10 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v11 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v12 clamp +; GFX9-NEXT: v_sub_i32 v5, v5, v13 clamp +; GFX9-NEXT: v_sub_i32 v6, v6, v14 clamp +; GFX9-NEXT: v_sub_i32 v7, v7, v15 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) + ret <8 x i32> %result +} + +define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { +; GFX6-LABEL: v_ssubsat_v16i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v32, 1 +; GFX6-NEXT: v_bfrev_b32_e32 v33, -2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v5, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v7, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v9, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v10, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v11, 
v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v12, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v13, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v14, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 +; GFX6-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v15, v32, v33, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_v16i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 +; GFX8-NEXT: v_bfrev_b32_e32 v32, 1 +; GFX8-NEXT: v_bfrev_b32_e32 v33, -2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v32, v33, s[6:7] +; GFX8-NEXT: 
s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 +; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v32, v33, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_i32 
v0, v0, v16 clamp +; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_i32 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_i32 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_i32 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_i32 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_i32 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_i32 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_i32 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_i32 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) + ret <16 x i32> %result +} + + +define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_ssubsat_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX6-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_ssubsat_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_ssubsat_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] +; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %result +} + +declare i8 @llvm.ssub.sat.i8(i8, i8) #0 +declare i16 @llvm.ssub.sat.i16(i16, i16) #0 +declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) #0 +declare i32 @llvm.ssub.sat.i32(i32, i32) #0 +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) #0 +declare <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32>, <3 x i32>) #0 +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) #0 +declare <8 x i32> 
@llvm.ssub.sat.v8i32(<8 x i32>, <8 x i32>) #0 +declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) #0 +declare i64 @llvm.ssub.sat.i64(i64, i64) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 2b8eba5f90149..a4d08ec980f44 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v3, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v3 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b32 v0, v1 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: 
ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: ds_write_b32 v0, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b32 v0, v1 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v2, s1 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: ds_write_b32 v0, v2 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v0, s3 -; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s3 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v0, s3 -; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v3, s3 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 { define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll new file mode 100644 index 0000000000000..1648c7fe37ccb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFILD %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -combiner-tokenfactor-inline-limit=7 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-TFIL7 %s + + +; GCN-LABEL: {{^}}token_factor_inline_limit_test: + +; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 + +; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}} + +; GCN: v_mov_b32_e32 v31, 7 +; GCN: s_getpc +define void 
@token_factor_inline_limit_test() { +entry: + call void @external_void_func_8xv5i32( + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, + <5 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10>, + <5 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15>) + ret void +} + +declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>, + <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll new file mode 100644 index 0000000000000..56e4123e182a0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -0,0 +1,522 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_uaddsat_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) + ret i8 %result +} + +define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_uaddsat_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %result +} + +define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_uaddsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v2, v1 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i32
@llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %result +} + +define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { +; GFX6-LABEL: v_uaddsat_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) + ret <2 x i16> %result +} + +define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { +; GFX6-LABEL: v_uaddsat_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v2 clamp +; GFX8-NEXT: v_add_u16_e64 v1, v1, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) + ret <3 x i16> %result +} + +define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; GFX6-LABEL: v_uaddsat_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v2 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v1, v1, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %cast = bitcast <4 x i16> %result to <2 x float> + ret <2 x float> %cast +} + +define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; GFX6-LABEL: v_uaddsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v4, v2 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v3 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %result +} + +define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { +; GFX6-LABEL: v_uaddsat_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_not_b32_e32 v3, v4 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v3 +; GFX6-NEXT: v_not_b32_e32 v3, v5 +; GFX6-NEXT: v_min_u32_e32 v2, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], 
v2, v5 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) + ret <3 x i32> %result +} + +define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; GFX6-LABEL: v_uaddsat_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v8, v4 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v5 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v6 +; GFX6-NEXT: v_min_u32_e32 v2, v2, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v7 +; GFX6-NEXT: v_min_u32_e32 v3, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %result +} + +define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { +; GFX6-LABEL: v_uaddsat_v8i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v16, v8 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v16 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v9 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v10 +; GFX6-NEXT: v_min_u32_e32 v2, v2, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v11 +; GFX6-NEXT: v_min_u32_e32 v3, v3, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v12 +; GFX6-NEXT: v_min_u32_e32 v4, v4, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v13 +; GFX6-NEXT: v_min_u32_e32 v5, v5, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v14 +; GFX6-NEXT: v_min_u32_e32 v6, v6, v8 +; GFX6-NEXT: v_not_b32_e32 v8, v15 +; GFX6-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v8i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v8 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v9 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v10 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v11 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v12 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v13 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v14 clamp +; 
GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v15 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v8 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v9 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v10 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v11 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v12 clamp +; GFX9-NEXT: v_add_u32_e64 v5, v5, v13 clamp +; GFX9-NEXT: v_add_u32_e64 v6, v6, v14 clamp +; GFX9-NEXT: v_add_u32_e64 v7, v7, v15 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) + ret <8 x i32> %result +} + +define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { +; GFX6-LABEL: v_uaddsat_v16i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v32, v16 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v32 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v17 +; GFX6-NEXT: v_min_u32_e32 v1, v1, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v18 +; GFX6-NEXT: v_min_u32_e32 v2, v2, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v19 +; GFX6-NEXT: v_min_u32_e32 v3, v3, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v20 +; GFX6-NEXT: v_min_u32_e32 v4, v4, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v21 +; GFX6-NEXT: v_min_u32_e32 v5, v5, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v22 +; GFX6-NEXT: v_min_u32_e32 v6, v6, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v23 +; GFX6-NEXT: v_min_u32_e32 v7, v7, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v24 +; GFX6-NEXT: v_min_u32_e32 v8, v8, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v25 +; GFX6-NEXT: v_min_u32_e32 v9, v9, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v26 +; GFX6-NEXT: v_min_u32_e32 v10, v10, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v27 +; GFX6-NEXT: v_min_u32_e32 v11, v11, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v28 +; GFX6-NEXT: v_min_u32_e32 v12, v12, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v29 +; GFX6-NEXT: v_min_u32_e32 v13, v13, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v30 +; GFX6-NEXT: v_min_u32_e32 v14, v14, v16 +; GFX6-NEXT: v_not_b32_e32 v16, v31 +; GFX6-NEXT: v_min_u32_e32 v15, v15, v16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v17 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v18 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v19 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v20 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v21 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v22 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v23 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v24 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v25 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v26 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v27 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v28 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v29 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v30 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v31 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_v16i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: 
v_add_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) + ret <16 x i32> %result +} + + +define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_uaddsat_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_uaddsat_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_uaddsat_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %result +} + +declare i8 @llvm.uadd.sat.i8(i8, i8) #0 +declare i16 @llvm.uadd.sat.i16(i16, i16) #0 +declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) #0 +declare i32 @llvm.uadd.sat.i32(i32, i32) #0 +declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) #0 +declare <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32>, <3 x i32>) #0 +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) #0 +declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>) #0 +declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) #0 +declare i64 @llvm.uadd.sat.i64(i64, i64) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 559f1092e6e44..fe613f2579e45 100644 --- 
a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -39,6 +39,7 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 ; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6 @@ -50,14 +51,12 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x1d ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll new file mode 100644 index 0000000000000..97bfdc479e090 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -0,0 +1,492 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_usubsat_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) + ret i8 %result +} + +define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_usubsat_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %result +} + +define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_usubsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %result +} + +define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { +; GFX6-LABEL: v_usubsat_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v5 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) + ret <2 x i16> %result +} + +define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { +; GFX6-LABEL: v_usubsat_v3i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v7, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v8 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v7 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v2, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v3i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp +; GFX8-NEXT: v_sub_u16_e64 v1, 
v1, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) + ret <3 x i16> %result +} + +define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; GFX6-LABEL: v_usubsat_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v10, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v11, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v11 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v10 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v9, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v3, v9 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v2 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v1, v1, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %cast = bitcast <4 x i16> %result to <2 x float> + ret <2 x float> %cast +} + +define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; GFX6-LABEL: v_usubsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %result +} + +define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { +; GFX6-LABEL: v_usubsat_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) + ret <3 x i32> %result +} + +define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; GFX6-LABEL: v_usubsat_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v5 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v6 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v7 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %result +} + +define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { +; GFX6-LABEL: v_usubsat_v8i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v8 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v10 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v11 +; GFX6-NEXT: v_max_u32_e32 v4, v4, v12 +; GFX6-NEXT: v_max_u32_e32 v5, v5, v13 +; GFX6-NEXT: v_max_u32_e32 v6, v6, v14 +; GFX6-NEXT: v_max_u32_e32 v7, v7, v15 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v11 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v13 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v14 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v15 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v8i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v9 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v10 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v11 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v12 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v13 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v14 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v15 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v9 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v10 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v11 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v12 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, v5, v13 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, v6, v14 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) + ret <8 x i32> %result +} + +define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { +; GFX6-LABEL: v_usubsat_v16i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v0, v0, v16 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v17 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v18 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v19 +; GFX6-NEXT: v_max_u32_e32 v4, v4, v20 +; GFX6-NEXT: v_max_u32_e32 v5, v5, v21 +; GFX6-NEXT: v_max_u32_e32 v6, v6, v22 +; GFX6-NEXT: v_max_u32_e32 v7, v7, v23 +; GFX6-NEXT: v_max_u32_e32 v8, v8, v24 +; GFX6-NEXT: v_max_u32_e32 v9, v9, v25 +; GFX6-NEXT: v_max_u32_e32 v10, v10, v26 +; GFX6-NEXT: v_max_u32_e32 v11, v11, v27 +; GFX6-NEXT: v_max_u32_e32 v12, v12, v28 +; GFX6-NEXT: v_max_u32_e32 v13, v13, v29 +; GFX6-NEXT: v_max_u32_e32 v14, v14, v30 +; GFX6-NEXT: v_max_u32_e32 v15, v15, v31 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v20 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v21 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v22 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v23 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v24 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v25 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v26 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v27 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v31 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_v16i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, 
s[4:5], v13, v29 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) + ret <16 x i32> %result +} + + +define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_usubsat_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_usubsat_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_usubsat_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %result +} + +declare i8 @llvm.usub.sat.i8(i8, i8) #0 +declare i16 @llvm.usub.sat.i16(i16, i16) #0 +declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0 +declare i32 @llvm.usub.sat.i32(i32, i32) #0 +declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0 +declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0 +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0 +declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0 +declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0 +declare i64 @llvm.usub.sat.i64(i64, i64) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/ARM/add-sub-imm.ll b/llvm/test/CodeGen/ARM/add-sub-imm.ll new file mode 100644 index 0000000000000..71a860537ba1b --- /dev/null +++ b/llvm/test/CodeGen/ARM/add-sub-imm.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=arm-eabi %s -o - | 
FileCheck %s --check-prefix=CHECK + +;; Check how immediates are handled in add/sub. + +define i32 @sub0(i32 %0) { +; CHECK-LABEL: sub0: +; CHECK: @ %bb.0: +; CHECK-NEXT: sub r0, r0, #23 +; CHECK-NEXT: mov pc, lr + %2 = sub i32 %0, 23 + ret i32 %2 +} + +define i32 @sub1(i32 %0) { +; CHECK-LABEL: sub1: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, .LCPI1_0 +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 4294836225 @ 0xfffe0001 + %2 = sub i32 %0, 131071 + ret i32 %2 +} + +define i32 @sub2(i32 %0) { +; CHECK-LABEL: sub2: +; CHECK: @ %bb.0: +; CHECK-NEXT: sub r0, r0, #35 +; CHECK-NEXT: sub r0, r0, #8960 +; CHECK-NEXT: mov pc, lr + %2 = sub i32 %0, 8995 + ret i32 %2 +} + +define i32 @add0(i32 %0) { +; CHECK-LABEL: add0: +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, #23 +; CHECK-NEXT: mov pc, lr + %2 = add i32 %0, 23 + ret i32 %2 +} + +define i32 @add1(i32 %0) { +; CHECK-LABEL: add1: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, .LCPI4_0 +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .long 131071 @ 0x1ffff + %2 = add i32 %0, 131071 + ret i32 %2 +} + +define i32 @add2(i32 %0) { +; CHECK-LABEL: add2: +; CHECK: @ %bb.0: +; CHECK-NEXT: add r0, r0, #8960 +; CHECK-NEXT: add r0, r0, #2293760 +; CHECK-NEXT: mov pc, lr + %2 = add i32 %0, 2302720 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir new file mode 100644 index 0000000000000..c5a8fabfdc79b --- /dev/null +++ b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir @@ -0,0 +1,13 @@ +# RUN: llc -mtriple=arm-eabi -O1 -run-pass=machine-cp %s -o - \ +# RUN: -verify-machineinstrs -simplify-mir | FileCheck %s + +name: h +body: | + bb.0: + + dead renamable $r9, renamable $r0 = UMULL renamable $lr, killed renamable $r0, 14 /* CC::al */, $noreg, $noreg + + ; CHECK: dead renamable $r9, renamable $r0 = UMULL renamable $lr, killed renamable $r0, 14 /* CC::al */, $noreg, $noreg + + renamable $r9 = COPY killed renamable $r0 +... 
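A note on the usubsat checks above: GFX9 has a clamping subtract, while GFX6 lowers usub.sat(a, b) as max(a, b) - b; raising the minuend to at least b makes the subtraction wrap-free, so the difference is exactly the saturated result. A minimal C check of that equivalence (an illustrative sketch, not LLVM code):

#include <assert.h>
#include <stdint.h>

/* Reference semantics of llvm.usub.sat.i32: subtract, clamping at 0. */
static uint32_t usub_sat32(uint32_t a, uint32_t b) {
  return a > b ? a - b : 0;
}

int main(void) {
  /* GFX6-style lowering, checked exhaustively over 8-bit operands. */
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      uint32_t mx = a > b ? a : b;          /* v_max_u32 */
      assert(mx - b == usub_sat32(a, b));   /* v_sub matches the clamp */
    }
  }
  return 0;
}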
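The add-sub-imm.ll tests above are governed by the ARM-mode immediate rule: a data-processing operand is encodable only as an 8-bit value rotated right by an even amount. The sketch below (a hypothetical helper, not LLVM's actual encoder) shows why 23 and the split pieces 35/8960 encode while 131071 forces the literal-pool loads seen in sub1 and add1:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* ARM-mode data-processing immediates: legal iff the value is an 8-bit
   constant rotated right by an even amount. Rotating the candidate LEFT
   by the same amount must therefore recover a value <= 0xFF. */
static bool arm_imm_encodable(uint32_t v) {
  for (unsigned rot = 0; rot < 32; rot += 2) {
    uint32_t imm8 = (v << rot) | (v >> ((32u - rot) & 31u));
    if (imm8 <= 0xFFu)
      return true;
  }
  return false;
}

int main(void) {
  /* 23 and 8960 (0x2300) encode directly; 8995 (0x2323) does not, hence
     the sub #35 / sub #8960 split; 131071 (0x1ffff) fails every rotation,
     which is why sub1/add1 fall back to a literal-pool ldr. */
  printf("23: %d, 8960: %d, 8995: %d, 131071: %d\n",
         arm_imm_encodable(23), arm_imm_encodable(8960),
         arm_imm_encodable(8995), arm_imm_encodable(131071));
  return 0;
}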
diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 628bae6e1dc93..5cdfe3326e922 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -85,7 +85,8 @@ entry: define i32 @t4(i32 %a, i32 %b, i32 %x) nounwind { entry: ; ARM-LABEL: t4: -; ARM: ldr +; ARM: mvn [[R0:r[0-9]+]], #170 +; ARM: sub [[R0]], [[R0]], #11141120 ; ARM: mov{{lt|ge}} ; ARMT2-LABEL: t4: diff --git a/llvm/test/CodeGen/ARM/ssat-with-shift.ll b/llvm/test/CodeGen/ARM/ssat-with-shift.ll new file mode 100644 index 0000000000000..03670eb7e0075 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ssat-with-shift.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+dsp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc i32 @ssat_lsl(i32 %num){ +; CHECK-LABEL: ssat_lsl +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ssat r0, #8, r0, lsl #7 +; CHECK-NEXT: bx lr +entry: + %shl = shl i32 %num, 7 + %0 = tail call i32 @llvm.arm.ssat(i32 %shl, i32 8) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @ssat_asr(i32 %num){ +; CHECK-LABEL: ssat_asr +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ssat r0, #8, r0, asr #7 +; CHECK-NEXT: bx lr +entry: + %shr = ashr i32 %num, 7 + %0 = tail call i32 @llvm.arm.ssat(i32 %shr, i32 8) + ret i32 %0 +} + +declare i32 @llvm.arm.ssat(i32, i32) diff --git a/llvm/test/CodeGen/ARM/thumb2-it-block.ll b/llvm/test/CodeGen/ARM/thumb2-it-block.ll index 6d93869ec10fd..c1c1c74dafd17 100644 --- a/llvm/test/CodeGen/ARM/thumb2-it-block.ll +++ b/llvm/test/CodeGen/ARM/thumb2-it-block.ll @@ -16,9 +16,7 @@ entry: ; CHECK: cmp ; CHECK-NEXT: it mi -; We shouldn't need to check for the extra 's' here; tRSB should be printed as -; "rsb" inside an IT block, not "rsbs". -; CHECK-NEXT: rsb{{s?}}mi +; CHECK-NEXT: rsbmi ; CHECK-NEXT: cmp ; CHECK-NEXT: it mi ; CHECK-NEXT: rsb{{s?}}mi diff --git a/llvm/test/CodeGen/ARM/usat-with-shift.ll b/llvm/test/CodeGen/ARM/usat-with-shift.ll new file mode 100644 index 0000000000000..56444394157a8 --- /dev/null +++ b/llvm/test/CodeGen/ARM/usat-with-shift.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+dsp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc i32 @usat_lsl(i32 %num){ +; CHECK-LABEL: usat_lsl +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: usat r0, #7, r0, lsl #2 +; CHECK-NEXT: bx lr +entry: + %shl = shl i32 %num, 2 + %0 = tail call i32 @llvm.arm.usat(i32 %shl, i32 7) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @usat_asr(i32 %num){ +; CHECK-LABEL: usat_asr +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: usat r0, #7, r0, asr #2 +; CHECK-NEXT: bx lr +entry: + %shr = ashr i32 %num, 2 + %0 = tail call i32 @llvm.arm.usat(i32 %shr, i32 7) + ret i32 %0 +} + +declare i32 @llvm.arm.usat(i32, i32) diff --git a/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll b/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll new file mode 100644 index 0000000000000..e3febe0264ad4 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; Test that the address for a store conditional for a byte is aligned +; correctly to use the memw_locked instruction.
+ +; CHECK: [[REG:(r[0-9]+)]] = and(r{{[0-9]+}},#-4) +; CHECK: = memw_locked([[REG]]) +; CHECK: memw_locked([[REG]],p{{[0-4]}}) = + +@foo.a00 = internal global i8 0, align 1 + +; Function Attrs: nofree norecurse nounwind +define dso_local void @foo() local_unnamed_addr #0 { +entry: + %0 = cmpxchg volatile i8* @foo.a00, i8 0, i8 1 seq_cst seq_cst + ret void +} + diff --git a/llvm/test/CodeGen/Hexagon/funnel-shift.ll b/llvm/test/CodeGen/Hexagon/funnel-shift.ll index b4b8f4726021f..6ad3b8a30519e 100644 --- a/llvm/test/CodeGen/Hexagon/funnel-shift.ll +++ b/llvm/test/CodeGen/Hexagon/funnel-shift.ll @@ -19,8 +19,8 @@ b0: } ; CHECK-LABEL: f2: -; CHECK: r[[R20:[0-9]+]]:[[R21:[0-9]+]] = asl(r3:2,#17) -; CHECK: r[[R20]]:[[R21]] |= lsr(r1:0,#47) +; CHECK: r[[R20:[0-9]+]]:[[R21:[0-9]+]] = asl(r1:0,#17) +; CHECK: r[[R20]]:[[R21]] |= lsr(r3:2,#47) define i64 @f2(i64 %a0, i64 %a1) #1 { b0: %v0 = tail call i64 @llvm.fshl.i64(i64 %a0, i64 %a1, i64 17) @@ -28,9 +28,9 @@ b0: } ; CHECK-LABEL: f3: -; CHECK: r[[R30:[0-9]+]]:[[R31:[0-9]+]] = asl(r3:2,r4) +; CHECK: r[[R30:[0-9]+]]:[[R31:[0-9]+]] = asl(r1:0,r4) ; CHECK: r[[R32:[0-9]+]] = sub(#64,r4) -; CHECK: r[[R30]]:[[R31]] |= lsr(r1:0,r[[R32]]) +; CHECK: r[[R30]]:[[R31]] |= lsr(r3:2,r[[R32]]) define i64 @f3(i64 %a0, i64 %a1, i64 %a2) #1 { b0: %v0 = tail call i64 @llvm.fshl.i64(i64 %a0, i64 %a1, i64 %a2) diff --git a/llvm/test/CodeGen/Hexagon/funnel-shift2.ll b/llvm/test/CodeGen/Hexagon/funnel-shift2.ll new file mode 100644 index 0000000000000..b3032dbb7efd0 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/funnel-shift2.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; This patch corrects the order of operands in the pattern that lowers funnel +; shift-left. + +; CHECK: r[[R17:[0-9]+]]:[[R16:[0-9]+]] = combine +; CHECK: call parity +; CHECK: r[[R1:[0-9]+]]:[[R0:[0-9]+]] = asl(r[[R1]]:[[R0]],#63) +; CHECK: r[[R1]]:[[R0]] |= lsr(r[[R17]]:[[R16]],#1) + +target triple = "hexagon-unknown-unknown-elf" + +define dso_local i64 @fshl(i64 %x, i64 %y) { +entry: + %x.addr = alloca i64, align 8 + %y.addr = alloca i64, align 8 + store i64 %x, i64* %x.addr, align 8 + store i64 %y, i64* %y.addr, align 8 + %0 = load i64, i64* %x.addr, align 8 + %shr = lshr i64 %0, 1 + %1 = load i64, i64* %x.addr, align 8 + %2 = load i64, i64* %y.addr, align 8 + %call = call i64 @parity(i64 %1, i64 %2) + %shl = shl i64 %call, 63 + %or = or i64 %shr, %shl + store i64 %or, i64* %x.addr, align 8 + %3 = load i64, i64* %x.addr, align 8 + ret i64 %3 +} + +declare dso_local i64 @parity(i64, i64) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index e39ff3b350c80..5d4eae6fca117 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -12,6 +12,8 @@ # FULL-NEXT: noSignedZerosFPMath: false # FULL-NEXT: memoryBound: true # FULL-NEXT: waveLimiter: true +# FULL-NEXT: hasSpilledSGPRs: false +# FULL-NEXT: hasSpilledVGPRs: false # FULL-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' # FULL-NEXT: frameOffsetReg: '$sgpr12' # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' @@ -83,6 +85,8 @@ body: | # FULL-NEXT: noSignedZerosFPMath: false # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false +# FULL-NEXT: hasSpilledSGPRs: false +# FULL-NEXT: hasSpilledVGPRs: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' @@ -121,6 +125,8 @@ body: 
| # FULL-NEXT: noSignedZerosFPMath: false # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false +# FULL-NEXT: hasSpilledSGPRs: false +# FULL-NEXT: hasSpilledVGPRs: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' @@ -160,6 +166,8 @@ body: | # FULL-NEXT: noSignedZerosFPMath: false # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false +# FULL-NEXT: hasSpilledSGPRs: false +# FULL-NEXT: hasSpilledVGPRs: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' @@ -259,3 +267,21 @@ body: | S_ENDPGM 0 ... + + +--- +# ALL-LABEL: name: parse_spilled_regs +# ALL: machineFunctionInfo: +# ALL: hasSpilledSGPRs: true +# ALL-NEXT: hasSpilledVGPRs: true + +name: parse_spilled_regs +machineFunctionInfo: + hasSpilledSGPRs: true + hasSpilledVGPRs: true + +body: | + bb.0: + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 94f926e451a9e..b6a0476389429 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -15,6 +15,8 @@ ; CHECK-NEXT: noSignedZerosFPMath: false ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: frameOffsetReg: '$fp_reg' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' @@ -48,6 +50,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: noSignedZerosFPMath: false ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: frameOffsetReg: '$fp_reg' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' @@ -76,6 +80,8 @@ define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { ; CHECK-NEXT: noSignedZerosFPMath: false ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' ; CHECK-NEXT: frameOffsetReg: '$sgpr33' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' @@ -103,6 +109,8 @@ define void @function() { ; CHECK-NEXT: noSignedZerosFPMath: true ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false +; CHECK-NEXT: hasSpilledSGPRs: false +; CHECK-NEXT: hasSpilledVGPRs: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' ; CHECK-NEXT: frameOffsetReg: '$sgpr33' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' diff --git a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll index 69f9cff5c525f..d93f192b1274d 100644 --- a/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll +++ b/llvm/test/CodeGen/PowerPC/2012-11-16-mischedcall.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -enable-misched < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -enable-misched < %s | FileCheck %s ; ; PR14315: misched should not move the physreg copy of %t below the calls. 
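On the Hexagon funnel-shift fixes above: llvm.fshl shifts the concatenation of its operands left and keeps the high half, so the high half must come from the first operand; the corrected checks pair asl of the first operand with lsr of the second. A small C sketch of those semantics (illustrative, not LLVM's implementation):

#include <assert.h>
#include <stdint.h>

/* Reference semantics of llvm.fshl.i64: concatenate x:y, shift left by
   n mod 64, keep the high half. For n mod 64 != 0 this is
   (x << n) | (y >> (64 - n)), i.e. asl of the FIRST operand combined
   with lsr of the second, as the corrected checks expect. */
static uint64_t fshl64(uint64_t x, uint64_t y, unsigned n) {
  n &= 63u;
  if (n == 0)
    return x;                      /* degenerate case: no bits from y */
  return (x << n) | (y >> (64u - n));
}

int main(void) {
  /* Mirrors f2 in funnel-shift.ll: amount 17 pairs asl #17 with lsr #47. */
  uint64_t x = 0x0123456789abcdefULL;
  uint64_t y = 0xfedcba9876543210ULL;
  assert(fshl64(x, y, 17) == ((x << 17) | (y >> 47)));
  assert(fshl64(x, y, 0) == x);
  return 0;
}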
diff --git a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir index 738aa1df5dd9d..a0139879f8c91 100644 --- a/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir +++ b/llvm/test/CodeGen/PowerPC/DisableHoistingDueToBlockHotnessProfileData.mir @@ -55,7 +55,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.module.flags = !{!0, !1} diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir index bcd51d31c6cfd..01ce79995512a 100644 --- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir +++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir @@ -30,7 +30,7 @@ ; Function Attrs: nounwind declare void @llvm.stackprotector(i8*, i8**) #1 - attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll b/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll deleted file mode 100644 index 17e3df6d58ccc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q-stackalign.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- 
-mcpu=a2q | FileCheck -check-prefix=CHECK-A2Q %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-bgq-linux -mcpu=a2 | FileCheck -check-prefix=CHECK-BGQ %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare i32 @bar(i8* %a) nounwind; -define i32 @foo() nounwind { - %p = alloca i8, i8 115 - store i8 0, i8* %p - %r = call i32 @bar(i8* %p) - ret i32 %r -} - -; Without QPX, the allocated stack frame is 240 bytes, but with QPX -; (because we require 32-byte alignment), it is 256 bytes. -; CHECK-A2: @foo -; CHECK-A2: stdu 1, -240(1) -; CHECK-A2Q: @foo -; CHECK-A2Q: stdu 1, -256(1) -; CHECK-BGQ: @foo -; CHECK-BGQ: stdu 1, -256(1) - diff --git a/llvm/test/CodeGen/PowerPC/a2q.ll b/llvm/test/CodeGen/PowerPC/a2q.ll deleted file mode 100644 index 84e2dfa991d78..0000000000000 --- a/llvm/test/CodeGen/PowerPC/a2q.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64-- -mcpu=a2 -mattr=+qpx | FileCheck %s - -define void @foo() { -entry: - ret void -} - -; CHECK: @foo - diff --git a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll index 1b0ea26f1fdea..d629148535aa7 100644 --- a/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll +++ b/llvm/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll @@ -298,7 +298,7 @@ _ZN10SubProcess12SafeSyscalls5fcntlEiil.exit: ; preds = %_ZN10SubProcess12Sa ; Function Attrs: nounwind argmemonly declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind argmemonly } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll index e9155f236c9e1..6372f9ef17d6c 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll @@ -36,18 +36,18 @@ entry: ; CHECK32: bb.0.entry: ; CHECK32-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 +; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store 4 into %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 4 +; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 8 ; CHECK32-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 12 ; CHECK32-DAG: STW renamable $r7, 16, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 16 ; CHECK32-DAG: STW renamable $r8, 20, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 20 
; CHECK32-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 24 -; CHECK32-DAG: STW killed renamable $r10, 28, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 28 -; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 -; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG1]], implicit-def $carry ; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG2]], implicit-def dead $carry, implicit killed $carry +; CHECK32: STW killed renamable $r10, 28, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 28 ; CHECK32: BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4 @@ -68,6 +68,7 @@ entry: ; CHECK64: bb.0.entry: ; CHECK64: liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 +; CHECK64: renamable $x[[REG1:[0-9]+]] = LD 80, %fixed-stack.0 ; CHECK64: STD killed renamable $x3, 0, %fixed-stack.0 :: (store 8 into %fixed-stack.0 ; CHECK64: STD killed renamable $x4, 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8 ; CHECK64: STD renamable $x5, 16, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 16 @@ -75,7 +76,6 @@ entry: ; CHECK64: STD killed renamable $x7, 32, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 32 ; CHECK64: STD killed renamable $x8, 40, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 40 ; CHECK64: STD killed renamable $x9, 48, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 48 -; CHECK64: STD killed renamable $x10, 56, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 56 -; CHECK64: renamable $x[[REG1:[0-9]+]] = LD 80, %fixed-stack.0 ; CHECK64: renamable $x3 = ADD8 killed renamable $x5, killed renamable $x[[REG1]] +; CHECK64: STD killed renamable $x10, 56, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 56 ; CHECK64: BLR8 implicit $lr8, implicit $rm, implicit $x3 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index dd401d159fc9e..bccb28ef22906 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -756,15 +756,15 @@ entry: ; 32BIT: bb.0.entry: ; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 +; 32BIT: STW killed renamable $r8, 20, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 20 ; 32BIT-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store 4 into %fixed-stack.0 ; 32BIT-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 4 ; 32BIT-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 8 ; 32BIT-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 12 ; 32BIT-DAG: STW killed renamable $r7, 16, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 16 -; 32BIT-DAG: STW killed renamable $r8, 20, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 20 +; 32BIT: renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load 1 ; 32BIT-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 24 ; 32BIT-DAG: STW killed renamable $r10, 28, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 28 -; 32BIT: renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load 1 ; 32BIT: BLR ; 64BIT: fixedStack: @@ -773,11 +773,11 @@ entry: ; 64BIT: bb.0.entry: ; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 +; 64BIT: STD killed renamable $x5, 16, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 16 ; 64BIT-DAG: STD killed renamable $x3, 0, %fixed-stack.0 :: (store 8 into %fixed-stack.0 +; 64BIT-NEXT: renamable $x3 = LBZ8 21,
%fixed-stack.0 :: (dereferenceable load 1 ; 64BIT-DAG: STD killed renamable $x4, 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8 -; 64BIT-DAG: STD killed renamable $x5, 16, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 16 ; 64BIT-DAG: STD killed renamable $x6, 24, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 24 -; 64BIT-NEXT: renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load 1 ; 64BIT-NEXT: BLR8 ; ASM-LABEL: .test_byval_32Byte: @@ -901,9 +901,9 @@ entry: ; 32BIT-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 12 ; 32BIT-DAG: STW killed renamable $r7, 16, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 16 ; 32BIT-DAG: STW killed renamable $r8, 20, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 20 +; 32BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load 8 ; 32BIT-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 24 ; 32BIT-DAG: STW killed renamable $r10, 28, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 28 -; 32BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load 8 ; 32BIT-NEXT: BLR ; 64BIT: fixedStack: @@ -913,10 +913,10 @@ entry: ; 64BIT: bb.0.entry: ; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 ; 64BIT-DAG: STD killed renamable $x3, 0, %fixed-stack.0 :: (store 8 into %fixed-stack.0 -; 64BIT-DAG: STD killed renamable $x4, 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8 ; 64BIT-DAG: STD killed renamable $x5, 16, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 16 -; 64BIT-DAG: STD killed renamable $x6, 24, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 24 ; 64BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load 8 +; 64BIT-DAG: STD killed renamable $x4, 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8 +; 64BIT-DAG: STD killed renamable $x6, 24, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 24 ; 64BIT-NEXT: BLR8 ; ASM32-LABEL: .test_byval_31Byte: diff --git a/llvm/test/CodeGen/PowerPC/aix-complex.ll b/llvm/test/CodeGen/PowerPC/aix-complex.ll new file mode 100644 index 0000000000000..9723614f68994 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-complex.ll @@ -0,0 +1,125 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,32BIT %s +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,64BIT %s + +@gcd = external global { double, double }, align 8 +@gcf = external global { float, float }, align 4 +@gcfp128 = external global { ppc_fp128, ppc_fp128 }, align 16 + +declare void @anchor(...) 
+ +define dso_local { double, double } @dblCmplxRetCallee() { +entry: + %retval = alloca { double, double }, align 8 + %retval.realp = getelementptr inbounds { double, double }, { double, double }* %retval, i32 0, i32 0 + store double 1.000000e+00, double* %retval.realp, align 8 + %retval.imagp = getelementptr inbounds { double, double }, { double, double }* %retval, i32 0, i32 1 + store double 0.000000e+00, double* %retval.imagp, align 8 + %0 = load { double, double }, { double, double }* %retval, align 8 + ret { double, double } %0 +} + +; CHECK-LABEL: .dblCmplxRetCallee: + +; CHECK-DAG: lfs 1, +; CHECK-DAG: lfs 2, +; CHECK: blr + +define dso_local void @dblCmplxRetCaller() { +entry: + %call = call { double, double } @dblCmplxRetCallee() + %0 = extractvalue { double, double } %call, 0 + %1 = extractvalue { double, double } %call, 1 + store double %0, double* getelementptr inbounds ({ double, double }, { double, double }* @gcd, i32 0, i32 0), align 8 + store double %1, double* getelementptr inbounds ({ double, double }, { double, double }* @gcd, i32 0, i32 1), align 8 + call void bitcast (void (...)* @anchor to void ()*)() + ret void +} + +; CHECK-LABEL: .dblCmplxRetCaller: + +; CHECK: bl .dblCmplxRetCallee +; 32BIT-NEXT: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; 64BIT-NEXT: ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; CHECK-DAG: stfd 1, 0([[REG]]) +; CHECK-DAG: stfd 2, 8([[REG]]) +; CHECK-NEXT: bl .anchor + +define dso_local { float, float } @fltCmplxRetCallee() { +entry: + %retval = alloca { float, float }, align 4 + %retval.realp = getelementptr inbounds { float, float }, { float, float }* %retval, i32 0, i32 0 + %retval.imagp = getelementptr inbounds { float, float }, { float, float }* %retval, i32 0, i32 1 + store float 1.000000e+00, float* %retval.realp, align 4 + store float 0.000000e+00, float* %retval.imagp, align 4 + %0 = load { float, float }, { float, float }* %retval, align 4 + ret { float, float } %0 +} + +; CHECK-LABEL: .fltCmplxRetCallee: + +; CHECK-DAG: lfs 1, +; CHECK-DAG: lfs 2, +; CHECK: blr + +define dso_local void @fltCmplxRetCaller() { +entry: + %call = call { float, float } @fltCmplxRetCallee() + %0 = extractvalue { float, float } %call, 0 + %1 = extractvalue { float, float } %call, 1 + store float %0, float* getelementptr inbounds ({ float, float }, { float, float }* @gcf, i32 0, i32 0), align 4 + store float %1, float* getelementptr inbounds ({ float, float }, { float, float }* @gcf, i32 0, i32 1), align 4 + call void bitcast (void (...)* @anchor to void ()*)() + ret void +} + +; CHECK-LABEL: .fltCmplxRetCaller: + +; CHECK: bl .fltCmplxRetCallee +; 32BIT-NEXT: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; 64BIT-NEXT: ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; CHECK-DAG: stfs 1, 0([[REG]]) +; CHECK-DAG: stfs 2, 4([[REG]]) +; CHECK-NEXT: bl .anchor + +define dso_local { ppc_fp128, ppc_fp128 } @fp128CmplxRetCallee() { +entry: + %retval = alloca { ppc_fp128, ppc_fp128 }, align 16 + %retval.realp = getelementptr inbounds { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %retval, i32 0, i32 0 + %retval.imagp = getelementptr inbounds { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %retval, i32 0, i32 1 + store ppc_fp128 0xM7ffeffffffffffffffffffffffffffff, ppc_fp128* %retval.realp, align 16 + store ppc_fp128 0xM3ffefffffffffffffffffffffffffffe, ppc_fp128* %retval.imagp, align 16 + %0 = load { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %retval, align 16 + ret { ppc_fp128, ppc_fp128 } %0 +} + +; CHECK-LABEL: .fp128CmplxRetCallee: + +; CHECK-DAG: lfd 1, +; CHECK-DAG: lfd 2, 
+; CHECK-DAG: lfd 3, +; CHECK-DAG: lfd 4, +; CHECK: blr + +define dso_local void @fp128CmplxRetCaller() { +entry: + %call = call { ppc_fp128, ppc_fp128 } @fp128CmplxRetCallee() + %0 = extractvalue { ppc_fp128, ppc_fp128 } %call, 0 + %1 = extractvalue { ppc_fp128, ppc_fp128 } %call, 1 + store ppc_fp128 %0, ppc_fp128* getelementptr inbounds ({ ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* @gcfp128, i32 0, i32 0), align 16 + store ppc_fp128 %1, ppc_fp128* getelementptr inbounds ({ ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* @gcfp128, i32 0, i32 1), align 16 + call void bitcast (void (...)* @anchor to void ()*)() + ret void +} + +; CHECK-LABEL: .fp128CmplxRetCaller: + +; CHECK: bl .fp128CmplxRetCallee +; 32BIT-NEXT: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; 64BIT-NEXT: ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2) +; CHECK-DAG: stfd 1, 0([[REG]]) +; CHECK-DAG: stfd 2, 8([[REG]]) +; CHECK-DAG: stfd 3, 16([[REG]]) +; CHECK-DAG: stfd 4, 24([[REG]]) +; CHECK-NEXT: bl .anchor diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll new file mode 100644 index 0000000000000..acc521e028859 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll @@ -0,0 +1,95 @@ +; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr4 \ +; RUN: -mattr=-altivec -function-sections < %s | \ +; RUN: FileCheck --check-prefix=ASM %s +; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr4 \ +; RUN: -mattr=-altivec -function-sections < %s | \ +; RUN: FileCheck --check-prefix=ASM %s + +@alias_foo = alias void (...), bitcast (void ()* @foo to void (...)*) + +define void @foo() { +entry: + ret void +} + +define hidden void @hidden_foo() { +entry: + ret void +} + +define void @bar() { +entry: + call void @foo() + call void @static_overalign_foo() + call void bitcast (void (...)* @alias_foo to void ()*)() + call void bitcast (void (...)* @extern_foo to void ()*)() + call void @hidden_foo() + ret void +} + +declare void @extern_foo(...) 
+ +define internal void @static_overalign_foo() align 64 { +entry: + ret void +} + +; ASM: .csect .foo[PR],2 +; ASM-NEXT: .globl foo[DS] # -- Begin function foo +; ASM-NEXT: .globl .foo[PR] +; ASM-NEXT: .align 4 +; ASM-NEXT: .csect foo[DS] +; ASM-NEXT: alias_foo: # @foo +; ASM-NEXT: .vbyte {{[0-9]+}}, .foo[PR] +; ASM-NEXT: .vbyte {{[0-9]+}}, TOC[TC0] +; ASM-NEXT: .vbyte {{[0-9]+}}, 0 +; ASM-NEXT: .csect .foo[PR],2 +; ASM-NEXT: .alias_foo: +; ASM-NEXT: # %bb.0: # %entry +; ASM-NEXT: blr +; ASM: .csect .hidden_foo[PR],2 +; ASM-NEXT: .globl hidden_foo[DS],hidden # -- Begin function hidden_foo +; ASM-NEXT: .globl .hidden_foo[PR],hidden +; ASM-NEXT: .align 4 +; ASM-NEXT: .csect hidden_foo[DS] +; ASM-NEXT: .vbyte {{[0-9]+}}, .hidden_foo[PR] # @hidden_foo +; ASM-NEXT: .vbyte {{[0-9]+}}, TOC[TC0] +; ASM-NEXT: .vbyte {{[0-9]+}}, 0 +; ASM-NEXT: .csect .hidden_foo[PR] +; ASM-NEXT: # %bb.0: # %entry +; ASM-NEXT: blr +; ASM: .csect .bar[PR],2 +; ASM-NEXT: .globl bar[DS] # -- Begin function bar +; ASM-NEXT: .globl .bar[PR] +; ASM-NEXT: .align 4 +; ASM-NEXT: .csect bar[DS] +; ASM-NEXT: .vbyte {{[0-9]+}}, .bar[PR] # @bar +; ASM-NEXT: .vbyte {{[0-9]+}}, TOC[TC0] +; ASM-NEXT: .vbyte {{[0-9]+}}, 0 +; ASM-NEXT: .csect .bar[PR],2 +; ASM-NEXT: # %bb.0: # %entry +; ASM: bl .foo[PR] +; ASM-NEXT: nop +; ASM-NEXT: bl .static_overalign_foo[PR] +; ASM-NEXT: nop +; ASM-NEXT: bl .alias_foo +; ASM-NEXT: nop +; ASM-NEXT: bl .extern_foo +; ASM-NEXT: nop +; ASM-NEXT: bl .hidden_foo[PR] +; ASM-NEXT: nop +; ASM: .csect .static_overalign_foo[PR],6 +; ASM-NEXT: .lglobl static_overalign_foo[DS] # -- Begin function static_overalign_foo +; ASM-NEXT: .lglobl .static_overalign_foo[PR] +; ASM-NEXT: .align 6 +; ASM-NEXT: .csect static_overalign_foo[DS] +; ASM-NEXT: .vbyte {{[0-9]+}}, .static_overalign_foo[PR] # @static_overalign_foo +; ASM-NEXT: .vbyte {{[0-9]+}}, TOC[TC0] +; ASM-NEXT: .vbyte {{[0-9]+}}, 0 +; ASM-NEXT: .csect .static_overalign_foo[PR],6 +; ASM-NEXT: # %bb.0: # %entry +; ASM-NEXT: blr +; ASM: .extern .extern_foo +; ASM-NEXT: .extern extern_foo[DS] +; ASM-NEXT: .globl alias_foo +; ASM-NEXT: .globl .alias_foo diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll new file mode 100644 index 0000000000000..dd0812f3d8c78 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-used.ll @@ -0,0 +1,26 @@ +;; This test verifies that llc on AIX does not crash when llvm.used and +;; llvm.compiler.used are present in the IR.
+ +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mtriple powerpc-ibm-aix-xcoff < %s | \ +; RUN: FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mtriple powerpc64-ibm-aix-xcoff < %s | \ +; RUN: FileCheck %s + +@keep_this = internal global i32 2, align 4 +@keep_this2 = internal global i32 3, align 4 +@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @keep_this to i8*)], section "llvm.metadata" +@llvm.compiler.used = appending global [1 x i8*] [i8* bitcast (i32* @keep_this2 to i8*)], section "llvm.metadata" + +; CHECK-NOT: llvm.metadata +; CHECK-NOT: llvm.used +; CHECK-NOT: llvm.compiler.used + +; CHECK: .lglobl keep_this +; CHECK: keep_this: +; CHECK: .lglobl keep_this2 +; CHECK: keep_this2: + +; CHECK-NOT: llvm.metadata +; CHECK-NOT: llvm.used +; CHECK-NOT: llvm.compiler.used diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll index c276d4ccc3952..cb1544ef9756f 100644 --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -74,9 +74,9 @@ ; 32BIT-DAG: renamable $r11 = ADDI %fixed-stack.0, 0 ; 32BIT-DAG: STW renamable $r11, 0, %stack.0.arg1 :: (store 4 into %ir.0) ; 32BIT-DAG: STW renamable $r4, 0, %stack.0.arg1 :: (store 4 into %ir.arg1) -; 32BIT-DAG: renamable $r5 = LWZ 0, %fixed-stack.0 :: (load 4 from %ir.2) +; 32BIT-DAG: renamable $r6 = LWZ 0, %fixed-stack.0 :: (load 4 from %ir.2) ; 32BIT-DAG: renamable $r4 = LWZ 0, %fixed-stack.0 :: (load 4 from %ir.4) -; 32BIT-DAG: renamable $r3 = nsw ADD4 killed renamable $r5, killed renamable $r3 +; 32BIT-DAG: renamable $r3 = nsw ADD4 killed renamable $r6, killed renamable $r3 ; 32BIT-DAG: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 ; 32BIT-DAG: BLR implicit $lr, implicit $rm, implicit $r3 diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll index 6d3a71193fa9b..7ee8dd0a3712a 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix64-cc-abi-vaarg.ll @@ -63,15 +63,15 @@ ; 64BIT-DAG: STD killed renamable $x10, 48, %fixed-stack.0 :: (store 8) ; 64BIT-DAG: renamable $x11 = ADDI8 %fixed-stack.0, 0 ; 64BIT-DAG: STD renamable $x11, 0, %stack.1.arg2 :: (store 8 into %ir.1) -; 64BIT-DAG: renamable $x4 = LD 0, %stack.1.arg2 :: (load 8 from %ir.arg2) -; 64BIT-DAG: renamable $x7 = ADDI8 renamable $x4, 4 -; 64BIT-DAG: renamable $x5 = ADDI8 %fixed-stack.0, 4 -; 64BIT-DAG: renamable $r6 = LWZ 0, %fixed-stack.0 :: (load 4 from %fixed-stack.0, align 8) +; 64BIT-DAG: renamable $x6 = LD 0, %stack.1.arg2 :: (load 8 from %ir.arg2) +; 64BIT-DAG: renamable $x9 = ADDI8 renamable $x6, 4 +; 64BIT-DAG: renamable $x7 = ADDI8 %fixed-stack.0, 4 +; 64BIT-DAG: renamable $r8 = LWZ 0, %fixed-stack.0 :: (load 4 from %fixed-stack.0, align 8) ; 64BIT-DAG: STD killed renamable $x11, 0, %stack.0.arg1 :: (store 8 into %ir.0) -; 64BIT-DAG: STD killed renamable $x5, 0, %stack.0.arg1 :: (store 8 into %ir.arg1) -; 64BIT-DAG: STD killed renamable $x7, 0, %stack.1.arg2 :: (store 8 into %ir.arg2) -; 64BIT-DAG: renamable $r4 = LWZ 0, killed renamable $x4 :: (load 4) -; 64BIT-DAG: renamable $r3 = nsw ADD4 killed renamable $r6, renamable $r3, implicit killed $x3 +; 64BIT-DAG: STD killed renamable $x7, 0, %stack.0.arg1 :: (store 8 into %ir.arg1) +; 64BIT-DAG: STD killed renamable $x9, 0, %stack.1.arg2 :: (store 8 into %ir.arg2) +; 64BIT-DAG: renamable $r4 = LWZ 0, killed renamable $x6 :: (load 4) +; 64BIT-DAG: renamable $r3 = nsw ADD4 killed 
renamable $r8, renamable $r3, implicit killed $x3 ; 64BIT-DAG: renamable $r4 = RLWINM killed renamable $r4, 1, 0, 30 ; 64BIT-DAG: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4, implicit-def $x3 ; 64BIT-DAG: BLR8 implicit $lr8, implicit $rm, implicit $x3 @@ -225,14 +225,14 @@ ; 64BIT-DAG: STD killed renamable $x9, 40, %fixed-stack.0 :: (store 8) ; 64BIT-DAG: STD killed renamable $x10, 48, %fixed-stack.0 :: (store 8) ; 64BIT-DAG: STD renamable $x3, 0, %stack.1.arg2 :: (store 8 into %ir.1) -; 64BIT-DAG: renamable $x4 = LD 0, %stack.1.arg2 :: (load 8 from %ir.arg2) -; 64BIT-DAG: renamable $x5 = ADDI8 %fixed-stack.0, 8 +; 64BIT-DAG: renamable $x6 = LD 0, %stack.1.arg2 :: (load 8 from %ir.arg2) +; 64BIT-DAG: renamable $x7 = ADDI8 %fixed-stack.0, 8 ; 64BIT-DAG: STD killed renamable $x3, 0, %stack.0.arg1 :: (store 8 into %ir.0) -; 64BIT-DAG: STD killed renamable $x5, 0, %stack.0.arg1 :: (store 8 into %ir.arg1) +; 64BIT-DAG: STD killed renamable $x7, 0, %stack.0.arg1 :: (store 8 into %ir.arg1) ; 64BIT-DAG: renamable $f0 = LFD 0, %fixed-stack.0 :: (load 8) -; 64BIT-DAG: renamable $x3 = ADDI8 renamable $x4, 8 +; 64BIT-DAG: renamable $x3 = ADDI8 renamable $x6, 8 ; 64BIT-DAG: STD killed renamable $x3, 0, %stack.1.arg2 :: (store 8 into %ir.arg2) -; 64BIT-DAG: renamable $f2 = LFD 0, killed renamable $x4 :: (load 8) +; 64BIT-DAG: renamable $f2 = LFD 0, killed renamable $x6 :: (load 8) ; 64BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 64BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm ; 64BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm diff --git a/llvm/test/CodeGen/PowerPC/asm-Zy.ll b/llvm/test/CodeGen/PowerPC/asm-Zy.ll index 78bb0f4c73eca..c8b5e9f1aa1d1 100644 --- a/llvm/test/CodeGen/PowerPC/asm-Zy.ll +++ b/llvm/test/CodeGen/PowerPC/asm-Zy.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" define i32 @zytest(i32 %a) nounwind { entry: diff --git a/llvm/test/CodeGen/PowerPC/asm-constraints.ll b/llvm/test/CodeGen/PowerPC/asm-constraints.ll index a3e573d8935e9..da77d1a169792 100644 --- a/llvm/test/CodeGen/PowerPC/asm-constraints.ll +++ b/llvm/test/CodeGen/PowerPC/asm-constraints.ll @@ -65,7 +65,7 @@ entry: } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll index 2ac1b2b7514bc..faddb5b4cc7f0 100644 --- a/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll +++ 
b/llvm/test/CodeGen/PowerPC/builtins-ppc-p10vsx.ll @@ -2,9 +2,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ ; RUN: FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O0 \ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ ; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O0 \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-O0 ; These test cases aims to test the builtins for the Power10 VSX vector ; instructions introduced in ISA 3.1. @@ -19,6 +22,14 @@ define signext i32 @test_vec_test_lsbb_all_ones(<16 x i8> %vuca) { ; CHECK-NEXT: srwi r3, r3, 31 ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: test_vec_test_lsbb_all_ones: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xvtlsbb cr0, v2 +; CHECK-O0-NEXT: mfocrf r3, 128 +; CHECK-O0-NEXT: srwi r3, r3, 31 +; CHECK-O0-NEXT: extsw r3, r3 +; CHECK-O0-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.vsx.xvtlsbb(<16 x i8> %vuca, i32 1) ret i32 %0 @@ -32,7 +43,199 @@ define signext i32 @test_vec_test_lsbb_all_zeros(<16 x i8> %vuca) { ; CHECK-NEXT: rlwinm r3, r3, 3, 31, 31 ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: test_vec_test_lsbb_all_zeros: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: xvtlsbb cr0, v2 +; CHECK-O0-NEXT: mfocrf r3, 128 +; CHECK-O0-NEXT: rlwinm r3, r3, 3, 31, 31 +; CHECK-O0-NEXT: extsw r3, r3 +; CHECK-O0-NEXT: blr entry: %0 = tail call i32 @llvm.ppc.vsx.xvtlsbb(<16 x i8> %vuca, i32 0) ret i32 %0 } + +define void @vec_xst_trunc_sc(<1 x i128> %__vec, i64 %__offset, i8* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_sc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrbx v2, r6, r5 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_sc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextubrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: add r4, r6, r5 +; CHECK-O0-NEXT: stb r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <16 x i8> + %conv = extractelement <16 x i8> %0, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %__ptr, i64 %__offset + store i8 %conv, i8* %add.ptr, align 1 + ret void +} + +define void @vec_xst_trunc_uc(<1 x i128> %__vec, i64 %__offset, i8* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_uc: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stxvrbx v2, r6, r5 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_uc: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextubrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: add r4, r6, r5 +; CHECK-O0-NEXT: stb r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <16 x i8> + %conv = extractelement <16 x i8> %0, i32 0 + %add.ptr = getelementptr inbounds i8, i8* %__ptr, i64 %__offset + store i8 %conv, i8* %add.ptr, align 1 + ret void +} + +define void @vec_xst_trunc_ss(<1 x i128> %__vec, i64 %__offset, i16* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ss: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 1 +; CHECK-NEXT: stxvrhx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ss: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuhrx r3, r3, v2 +; 
CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 1 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: sth r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <8 x i16> + %conv = extractelement <8 x i16> %0, i32 0 + %add.ptr = getelementptr inbounds i16, i16* %__ptr, i64 %__offset + store i16 %conv, i16* %add.ptr, align 2 + ret void +} + +define void @vec_xst_trunc_us(<1 x i128> %__vec, i64 %__offset, i16* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_us: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 1 +; CHECK-NEXT: stxvrhx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_us: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuhrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 1 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: sth r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <8 x i16> + %conv = extractelement <8 x i16> %0, i32 0 + %add.ptr = getelementptr inbounds i16, i16* %__ptr, i64 %__offset + store i16 %conv, i16* %add.ptr, align 2 + ret void +} + +define void @vec_xst_trunc_si(<1 x i128> %__vec, i64 %__offset, i32* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_si: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 2 +; CHECK-NEXT: stxvrwx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_si: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuwrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 2 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: stw r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <4 x i32> + %conv = extractelement <4 x i32> %0, i32 0 + %add.ptr = getelementptr inbounds i32, i32* %__ptr, i64 %__offset + store i32 %conv, i32* %add.ptr, align 4 + ret void +} + +define void @vec_xst_trunc_ui(<1 x i128> %__vec, i64 %__offset, i32* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 2 +; CHECK-NEXT: stxvrwx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ui: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: li r3, 0 +; CHECK-O0-NEXT: vextuwrx r3, r3, v2 +; CHECK-O0-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-O0-NEXT: sldi r4, r5, 2 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: stw r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <4 x i32> + %conv = extractelement <4 x i32> %0, i32 0 + %add.ptr = getelementptr inbounds i32, i32* %__ptr, i64 %__offset + store i32 %conv, i32* %add.ptr, align 4 + ret void +} + +define void @vec_xst_trunc_sll(<1 x i128> %__vec, i64 %__offset, i64* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_sll: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sldi r3, r5, 3 +; CHECK-NEXT: stxvrdx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_sll: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: mfvsrld r3, v2 +; CHECK-O0-NEXT: sldi r4, r5, 3 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: std r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <2 x i64> + %conv = extractelement <2 x i64> %0, i32 0 + %add.ptr = getelementptr inbounds i64, i64* %__ptr, i64 %__offset + store i64 %conv, i64* %add.ptr, align 8 + ret void +} + +define void @vec_xst_trunc_ull(<1 x i128> %__vec, i64 %__offset, i64* nocapture %__ptr) { +; CHECK-LABEL: vec_xst_trunc_ull: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: sldi r3, r5, 3 +; CHECK-NEXT: stxvrdx v2, r6, r3 +; CHECK-NEXT: blr +; +; CHECK-O0-LABEL: vec_xst_trunc_ull: +; CHECK-O0: # %bb.0: # %entry +; CHECK-O0-NEXT: mfvsrld r3, v2 +; CHECK-O0-NEXT: sldi r4, r5, 3 +; CHECK-O0-NEXT: add r4, r6, r4 +; CHECK-O0-NEXT: std r3, 0(r4) +; CHECK-O0-NEXT: blr +entry: + %0 = bitcast <1 x i128> %__vec to <2 x i64> + %conv = extractelement <2 x i64> %0, i32 0 + %add.ptr = getelementptr inbounds i64, i64* %__ptr, i64 %__offset + store i64 %conv, i64* %add.ptr, align 8 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 11bc2bae9871f..cdd04b33318ea 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -446,5 +446,93 @@ entry: ret <16 x i8> %shuffle } +define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4Low: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 0 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4Low: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r3, r3, 4 +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4Low: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> + %1 = bitcast <16 x i8> %vecinit18 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4hi: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4hi: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4hi: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 3 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + %1 = bitcast <16 x i8> %vecinit22 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lxvdsx v2, 0, r3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat8: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha +; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: addi r3, r1, -16 +; CHECK-NOVSX-NEXT: lvx v3, 0, r3 +; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load
<8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %1 = bitcast <16 x i8> %vecinit30 to <2 x i64> + ret <2 x i64> %1 +} + declare double @dummy() local_unnamed_addr attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll new file mode 100644 index 0000000000000..67768bfe51f38 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs | FileCheck %s + +; FIXME: The following patterns are implemented in DAGCombiner.cpp, +; but the folds do not yet fire here, so the expected simplified code is not produced. + +; fold (or (and X, (xor Y, -1)), Y) to (or X, Y) +define i32 @pattern1(i32 %x, i32 %y){ +; CHECK-LABEL: pattern1: +; CHECK: # %bb.0: +; CHECK-NEXT: xori 5, 4, 65535 +; CHECK-NEXT: xoris 5, 5, 65535 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: blr + %a = xor i32 %y, -1 + %b = and i32 %x, %a + %c = or i32 %b, %y + ret i32 %c +} + +; fold (or (and (xor Y, -1), X), Y) to (or X, Y) +define i32 @pattern2(i32 %x, i32 %y){ +; CHECK-LABEL: pattern2: +; CHECK: # %bb.0: +; CHECK-NEXT: xori 5, 4, 65535 +; CHECK-NEXT: xoris 5, 5, 65535 +; CHECK-NEXT: and 3, 5, 3 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: blr + %a = xor i32 %y, -1 + %b = and i32 %a, %x + %c = or i32 %b, %y + ret i32 %c +} + +; fold (and (select Cond, 0, -1), X) to (select Cond, 0, X) +define i32 @pattern3(i1 %cond, i32 %x) { +; CHECK-LABEL: pattern3: +; CHECK: # %bb.0: +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: andi. 3, 3, 1 +; CHECK-NEXT: oris 3, 5, 65535 +; CHECK-NEXT: ori 3, 3, 65535 +; CHECK-NEXT: iselgt 3, 0, 3 +; CHECK-NEXT: and 3, 3, 4 +; CHECK-NEXT: blr + %sel = select i1 %cond, i32 0, i32 -1 + %res = and i32 %sel, %x + ret i32 %res +} + +; fold (or X, (select Cond, -1, 0)) to (select Cond, -1, X) +define i32 @pattern4(i1 %cond, i32 %x) { +; CHECK-LABEL: pattern4: +; CHECK: # %bb.0: +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: andi.
3, 3, 1 +; CHECK-NEXT: oris 3, 5, 65535 +; CHECK-NEXT: ori 3, 3, 65535 +; CHECK-NEXT: iselgt 3, 3, 5 +; CHECK-NEXT: or 3, 4, 3 +; CHECK-NEXT: blr + %sel = select i1 %cond, i32 -1, i32 0 + %res = or i32 %x, %sel + ret i32 %res +} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir index 2081e6fd02f51..904210ee13477 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-R0-special-handling.mir @@ -63,8 +63,8 @@ ret i64 %2 } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir index b52e0a4103add..f46d4fc0a42a4 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -187,7 +187,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index 4d2595e1abdcb..ba950dc3d3ae9 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -983,10 +983,10 @@ ret i64 %xor } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" 
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll index ed3c9f07c1a85..75640d1d26072 100644 --- a/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll +++ b/llvm/test/CodeGen/PowerPC/ctr-minmaxnum.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX declare float @fabsf(float) @@ -64,11 +63,6 @@ loop_exit: ; CHECK-NOT: xsmindp ; CHECK: blr -; QPX-LABEL: test1v: -; QPX: mtctr -; QPX-NOT: bl fminf -; QPX: blr - define void @test1a(float %f, float* %fp) { entry: br label %loop_body @@ -139,11 +133,6 @@ loop_exit: ; CHECK-NOT: xsmaxdp ; CHECK: blr -; QPX-LABEL: test2v: -; QPX: mtctr -; QPX-NOT: bl fmax -; QPX: blr - define void @test2a(float %f, float* %fp) { entry: br label %loop_body diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll index 44acfcdd6e66a..636c86b815c8c 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-shortLoops.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 -; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s --check-prefixes=CHECK,CHECK-A2Q ; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4 ; The latency of the mtctr is only justified if there are more than 4 comparisons that are removed as a result. 
@@ -86,11 +85,8 @@ for.body: ; preds = %entry, %for.body } ; Function Attrs: norecurse nounwind -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount2NonSmallLoop() { ; CHECK-LABEL: testTripCount2NonSmallLoop: -; CHECK-A2Q: mtctr ; CHECK-PWR8-NOT: mtctr ; CHECK: blr @@ -121,12 +117,9 @@ for.end: ; preds = %if.end ret i32 %conv } -; On core a2q, IssueWidth is 1. On core pwr8, IssueWidth is 8. -; a2q should use mtctr, but pwr8 should not use mtctr. define signext i32 @testTripCount5() { ; CHECK-LABEL: testTripCount5: ; CHECK-PWR8-NOT: mtctr -; CHECK-A2Q: mtctr entry: %.prea = load i32, i32* @a, align 4 diff --git a/llvm/test/CodeGen/PowerPC/ec-input.ll b/llvm/test/CodeGen/PowerPC/ec-input.ll index 9a1c121699a69..425bc1985d419 100644 --- a/llvm/test/CodeGen/PowerPC/ec-input.ll +++ b/llvm/test/CodeGen/PowerPC/ec-input.ll @@ -5,7 +5,7 @@ ; that were both inputs to the inline asm and also early-clobber outputs). target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713 = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712 = type { %struct._IO_marker.118.8248.32638.195238.200116.211498.218002.221254.222880.224506.226132.240766.244018.245644.248896.260278.271660.281416.283042.302554.304180.325318.326944.344712*, %struct._IO_FILE.119.8249.32639.195239.200117.211499.218003.221255.222881.224507.226133.240767.244019.245645.248897.260279.271661.281417.283043.302555.304181.325319.326945.344713*, i32 } diff --git a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll index e066b45d3ca4b..023928bcb5896 100644 --- a/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll +++ b/llvm/test/CodeGen/PowerPC/extra-toc-reg-deps.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64-unknown-linux" %"class.Foam::messageStream.6" = type <{ %"class.Foam::string.5", i32, i32, i32, [4 x i8] }> %"class.Foam::string.5" = type { %"class.std::basic_string.4" } @@ -419,8 +419,8 @@ declare void @_ZN4Foam11regIOobjectD2Ev() #0 declare void @_ZN4Foam6reduceIiNS_5sumOpIiEEEEvRKNS_4ListINS_8UPstream11commsStructEEERT_RKT0_ii() #0 -attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { inlinehint "less-precise-fpmad"="false" 
"frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="a2q" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { inlinehint "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll index fdd0fc2767803..b08b050f2c2fd 100644 --- a/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll +++ b/llvm/test/CodeGen/PowerPC/fast-isel-icmp-split.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs -O0 -relocation-model=pic < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__tree_node.130.151" = type { %"class.std::__1::__tree_node_base.base.128.149", %"class.boost::serialization::extended_type_info.129.150"* } %"class.std::__1::__tree_node_base.base.128.149" = type <{ %"class.std::__1::__tree_end_node.127.148", %"class.std::__1::__tree_node_base.126.147"*, %"class.std::__1::__tree_node_base.126.147"*, i8 }> diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll index eef6e0ccac02b..a336fc796ca52 100644 --- a/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate-duplicate-vreg.ll @@ -33,4 +33,4 @@ define float @f(float %xf) #0 { ret float %25 } -attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll index 2feb4556dfab7..3b555cf898f57 100644 --- a/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll +++ b/llvm/test/CodeGen/PowerPC/fp2int2fp-ppcfp128.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define linkonce_odr double @test1(ppc_fp128 %input) { entry: diff --git a/llvm/test/CodeGen/PowerPC/future-check-features.ll b/llvm/test/CodeGen/PowerPC/future-check-features.ll index cfaf4eab93c1e..4d9b6e7555924 100644 --- a/llvm/test/CodeGen/PowerPC/future-check-features.ll +++ b/llvm/test/CodeGen/PowerPC/future-check-features.ll @@ -1,9 +1,9 @@ -; RUN: llc -mattr=pcrelative-memops,prefix-instrs -verify-machineinstrs \ -; RUN: 
-mtriple=powerpc64le-unknown-unknown -ppc-asm-full-reg-names \ -; RUN: %s -o - 2>&1 | FileCheck %s -; RUN: llc -mattr=pcrelative-memops,prefix-instrs -verify-machineinstrs \ -; RUN: -mtriple=powerpc64-unknown-unknown -ppc-asm-full-reg-names \ -; RUN: %s -o - 2>&1 | FileCheck %s +; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops \ +; RUN: -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names %s -o - 2>&1 | FileCheck %s +; RUN: llc -mattr=pcrelative-memops,prefix-instrs,paired-vector-memops \ +; RUN: -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names %s -o - 2>&1 | FileCheck %s define dso_local signext i32 @f() { entry: diff --git a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll index 54c3e11528b7b..2aa5239f25eb8 100644 --- a/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll +++ b/llvm/test/CodeGen/PowerPC/glob-comp-aa-crash.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 } %"class.std::__1::__shared_count" = type { i32 (...)**, i64 } diff --git a/llvm/test/CodeGen/PowerPC/hardware-loops-crash.ll b/llvm/test/CodeGen/PowerPC/hardware-loops-crash.ll new file mode 100644 index 0000000000000..23e5b44fa87d1 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/hardware-loops-crash.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -hardware-loops -S -verify-loop-lcssa %s | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "ppc64-unknown-linux-elf" + +declare i1 @cond() readnone + +; Make sure we do not crash on the test. 
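; For orientation, a minimal sketch of the loop shape the -hardware-loops pass produces
; (hypothetical function name; the two intrinsics are the ones the CHECK lines below expect):
; the preheader seeds the iteration count, and the latch decrements it in place of an
; induction-variable compare.
declare void @llvm.set.loop.iterations.i64(i64)
declare i1 @llvm.loop.decrement.i64(i64)

define void @hw_loop_shape(i64 %n) {
entry:
  call void @llvm.set.loop.iterations.i64(i64 %n) ; seeds the counter (CTR on PowerPC)
  br label %loop

loop:
  %again = call i1 @llvm.loop.decrement.i64(i64 1) ; returns true while iterations remain
  br i1 %again, label %loop, label %exit

exit:
  ret void
}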
+ +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br label [[FOR_INC:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[C_0:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_0]], label [[WHILE_COND25:%.*]], label [[FOR_BODY]] +; CHECK: while.cond25: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[LAND_RHS:%.*]] ], [ 0, [[FOR_INC]] ] +; CHECK-NEXT: [[INDVARS_IV349:%.*]] = phi i64 [ [[INDVARS_IV_NEXT350:%.*]], [[LAND_RHS]] ], [ 50, [[FOR_INC]] ] +; CHECK-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[INDVARS_IV349]], 0 +; CHECK-NEXT: br i1 [[CMP26_NOT]], label [[WHILE_END187:%.*]], label [[LAND_RHS]] +; CHECK: land.rhs: +; CHECK-NEXT: [[INDVARS_IV_NEXT350]] = add nsw i64 [[INDVARS_IV349]], -1 +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[C_1]], label [[WHILE_COND25]], label [[WHILE_END:%.*]] +; CHECK: while.end: +; CHECK-NEXT: [[INDVAR_LCSSA1:%.*]] = phi i64 [ [[INDVAR]], [[LAND_RHS]] ] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[WHILE_END187]], label [[WHILE_COND35_PREHEADER:%.*]] +; CHECK: while.cond35.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i64 [[INDVAR_LCSSA1]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 51 +; CHECK-NEXT: call void @llvm.set.loop.iterations.i64(i64 [[TMP1]]) +; CHECK-NEXT: br label [[WHILE_COND35:%.*]] +; CHECK: while.cond35: +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.loop.decrement.i64(i64 1) +; CHECK-NEXT: br i1 [[TMP2]], label [[LAND_RHS37:%.*]], label [[IF_END51:%.*]] +; CHECK: land.rhs37: +; CHECK-NEXT: br label [[WHILE_COND35]] +; CHECK: if.end51: +; CHECK-NEXT: br label [[WHILE_COND_BACKEDGE:%.*]] +; CHECK: while.cond.backedge: +; CHECK-NEXT: br label [[WHILE_COND]] +; CHECK: while.end187: +; CHECK-NEXT: ret void +; +entry: + br label %while.cond + +while.cond: ; preds = %while.cond.backedge, %entry + br label %for.body + +for.body: ; preds = %for.inc, %while.cond + br label %for.inc + +for.inc: ; preds = %for.body + %c.0 = call i1 @cond() + br i1 %c.0, label %while.cond25, label %for.body + +while.cond25: ; preds = %land.rhs, %for.inc + %indvars.iv349 = phi i64 [ %indvars.iv.next350, %land.rhs ], [ 50, %for.inc ] + %cmp26.not = icmp eq i64 %indvars.iv349, 0 + br i1 %cmp26.not, label %while.end187, label %land.rhs + +land.rhs: ; preds = %while.cond25 + %indvars.iv.next350 = add nsw i64 %indvars.iv349, -1 + %c.1 = call i1 @cond() + br i1 %c.1, label %while.cond25, label %while.end + +while.end: ; preds = %land.rhs + %c.2 = call i1 @cond() + br i1 %c.2, label %while.end187, label %while.cond35.preheader + +while.cond35.preheader: ; preds = %while.end + %0 = and i64 %indvars.iv349, 4294967295 + br label %while.cond35 + +while.cond35: ; preds = %land.rhs37, %while.cond35.preheader + %indvars.iv351 = phi i64 [ %0, %while.cond35.preheader ], [ %indvars.iv.next352, %land.rhs37 ] + %cmp36 = icmp sgt i64 %indvars.iv351, 0 + br i1 %cmp36, label %land.rhs37, label %if.end51 + +land.rhs37: ; preds = %while.cond35 + %indvars.iv.next352 = add nsw i64 %indvars.iv351, -1 + br label %while.cond35 + +if.end51: ; preds = %while.cond35 + br label %while.cond.backedge + +while.cond.backedge: ; preds = %if.end51 + br label %while.cond + +while.end187: ; preds = %while.end, %while.cond25 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll 
b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll index 74bfa75e5e313..a2d0eb599f91d 100644 --- a/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll +++ b/llvm/test/CodeGen/PowerPC/ifcvt-forked-bug-2016-08-08.ll @@ -33,5 +33,5 @@ declare i8* @_ZN11__sanitizer21internal_start_threadEPFvPvES0_(void (i8*)*, i8*) declare hidden void @_ZN11__sanitizer16BackgroundThreadEPv(i8* nocapture readnone) #5 -attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #7 = { nobuiltin nounwind } diff --git a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll index e4dfd6c58f0e8..6f1bc76d816ae 100644 --- a/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll +++ b/llvm/test/CodeGen/PowerPC/inlineasm-i64-reg.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux -mcpu=a2 < %s | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %struct.BG_CoordinateMapping_t = type { [4 x i8] } diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir index 53337040e70a8..6457a3d242362 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash1.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash1.mir @@ -1,6 +1,6 @@ -# RUN: not --crash llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs \ -# RUN: | FileCheck %s +# RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ +# RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ +# RUN: FileCheck %s --- | ; Function Attrs: noreturn nounwind @@ -82,9 +82,46 @@ body: | STD %3, 0, %4 :: (store 8 into %ir.p) B %bb.1 + ; CHECK-LABEL: name: zext_free + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x3 + + ; CHECK: %4:g8rc_and_g8rc_nox0 = COPY killed $x3 + ; CHECK: %0:g8rc = LD 0, %4 :: (dereferenceable load 8 from %ir.p) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %0 + + ; CHECK: bb.1.loop: + ; CHECK: successors: %bb.1(0x20000000), %bb.2(0x60000000) + + ; CHECK: %1:g8rc_and_g8rc_nox0 = COPY killed %12 + ; CHECK: %5:gprc = LBZ 0, %1 :: (load 1 from %ir.0) + ; CHECK: %6:crrc = CMPWI killed %5, 0 + ; CHECK: %7:crbitrc = COPY killed %6.sub_eq + ; CHECK: %2:g8rc = nuw ADDI8 %1, 1 + ; CHECK: STD %2, 0, %4 :: (store 8 into %ir.p) + ; CHECK: %8:gprc = LBZ 1, %1 :: (load 1 from %ir.incdec.ptr) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY %2 + ; CHECK: BCn killed %7, %bb.1 + ; 
CHECK: B %bb.2 + + ; CHECK: bb.2.loop: + ; CHECK: successors: %bb.3(0x55555555), %bb.1(0x2aaaaaab) + + ; CHECK: %9:crrc = CMPWI killed %8, 0 + ; CHECK: %10:crbitrc = COPY killed %9.sub_eq + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %2 + ; CHECK: BC killed %10, %bb.1 + ; CHECK: B %bb.3 + + ; CHECK: bb.3.if.then3: + ; CHECK: successors: %bb.1(0x80000000) + + ; CHECK: %3:g8rc = nuw ADDI8 killed %1, 2 + ; CHECK: STD %3, 0, %4 :: (store 8 into %ir.p) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %3 + ; CHECK: B %bb.1 + + ... -# CHECK-LABEL: Bad machine code: LiveVariables: Block should not be in AliveBlocks -# CHECK-NEXT: - function: zext_free -# CHECK-NEXT: - basic block: %bb.2 loop -# CHECK-NEXT: Virtual register %2 is not needed live through the block. -# CHECK-NEXT: LLVM ERROR: Found 1 machine code errors. + diff --git a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir index e62db1ed4cb27..1bdf0419a8f59 100644 --- a/llvm/test/CodeGen/PowerPC/livevars-crash2.mir +++ b/llvm/test/CodeGen/PowerPC/livevars-crash2.mir @@ -1,6 +1,6 @@ -# RUN: not --crash llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ -# RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs \ -# RUN: | FileCheck %s +# RUN: llc -mtriple powerpc64le-unknown-linux-gnu %s -o - 2>&1 \ +# RUN: -run-pass=livevars,phi-node-elimination -verify-machineinstrs | \ +# RUN: FileCheck %s --- | define float @testfloatslt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) { @@ -176,20 +176,44 @@ body: | STD %3, 0, %4 :: (store 8 into %ir.p) B %bb.1 -... + ; CHECK-LABEL: name: testfloatslt + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x3 + + ; CHECK: %4:g8rc_and_g8rc_nox0 = COPY killed $x3 + ; CHECK: %0:g8rc = LD 0, %4 :: (dereferenceable load 8 from %ir.p) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %0 + + ; CHECK: bb.1.loop: + ; CHECK: successors: %bb.1(0x20000000), %bb.2(0x60000000) + + ; CHECK: %1:g8rc_and_g8rc_nox0 = COPY killed %12 + ; CHECK: %5:gprc = LBZ 0, %1 :: (load 1 from %ir.0) + ; CHECK: %6:crrc = CMPWI killed %5, 0 + ; CHECK: %7:crbitrc = COPY killed %6.sub_eq + ; CHECK: %2:g8rc = nuw ADDI8 %1, 1 + ; CHECK: STD %2, 0, %4 :: (store 8 into %ir.p) + ; CHECK: %8:gprc = LBZ 1, %1 :: (load 1 from %ir.incdec.ptr) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY %2 + ; CHECK: BCn killed %7, %bb.1 + ; CHECK: B %bb.2 -# CHECK-LABEL: Bad machine code: LiveVariables: Block should not be in AliveBlocks -# CHECK-NEXT: - function: testfloatslt -# CHECK-NEXT: - basic block: %bb.1 entry -# CHECK-NEXT: Virtual register %4 is not needed live through the block. + ; CHECK: bb.2.loop: + ; CHECK: successors: %bb.3(0x55555555), %bb.1(0x2aaaaaab) -# CHECK-LABEL: Bad machine code: LiveVariables: Block should not be in AliveBlocks -# CHECK-NEXT: - function: testfloatslt -# CHECK-NEXT: - basic block: %bb.1 entry -# CHECK-NEXT: Virtual register %5 is not needed live through the block. + ; CHECK: %9:crrc = CMPWI killed %8, 0 + ; CHECK: %10:crbitrc = COPY killed %9.sub_eq + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %2 + ; CHECK: BC killed %10, %bb.1 + ; CHECK: B %bb.3 -# CHECK-LABEL: Bad machine code: LiveVariables: Block should not be in AliveBlocks -# CHECK-NEXT: - function: testfloatslt -# CHECK-NEXT: - basic block: %bb.2 entry -# CHECK-NEXT: Virtual register %5 is not needed live through the block. -# CHECK-NEXT: LLVM ERROR: Found 3 machine code errors. 
+ ; CHECK: bb.3.if.then3: + ; CHECK: successors: %bb.1(0x80000000) + + ; CHECK: %3:g8rc = nuw ADDI8 killed %1, 2 + ; CHECK: STD %3, 0, %4 :: (store 8 into %ir.p) + ; CHECK: %12:g8rc_and_g8rc_nox0 = COPY killed %3 + ; CHECK: B %bb.1 + +... diff --git a/llvm/test/CodeGen/PowerPC/load-two-flts.ll b/llvm/test/CodeGen/PowerPC/load-two-flts.ll index 1cfcff5e01601..19e21faf47232 100644 --- a/llvm/test/CodeGen/PowerPC/load-two-flts.ll +++ b/llvm/test/CodeGen/PowerPC/load-two-flts.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) { entry: diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll index f4664788930d4..2cbb70bb14cb5 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch-inner.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-ppc-prefetching=true -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* %x, double* nocapture readonly %y) #0 { diff --git a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll index f4821564c202b..defc52eec8e0d 100644 --- a/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll +++ b/llvm/test/CodeGen/PowerPC/loop-data-prefetch.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -enable-ppc-prefetching=true -mcpu=a2 < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" ; Function Attrs: nounwind define void @foo(double* nocapture %a, double* nocapture readonly %b) #0 { diff --git a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll index a13192d3e6586..7fdabcd4be210 100644 --- a/llvm/test/CodeGen/PowerPC/loop-prep-all.ll +++ b/llvm/test/CodeGen/PowerPC/loop-prep-all.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BGQ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -21,7 +20,6 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @foo -; CHECK-BGQ-DAG: dcbt 4, 5 ; CHECK-DAG: lfdu [[REG1:[0-9]+]], 8({{[0-9]+}}) ; CHECK-DAG: fadd [[REG2:[0-9]+]], [[REG1]], 0 ; CHECK-DAG: stfdu [[REG2]], 8({{[0-9]+}}) @@ -34,15 +32,13 @@ for.cond.cleanup6: ; preds = %for.body7 for.body7: ; preds = %for.body, %for.body7 %i3.017 = phi i32 [ %inc9, %for.body7 ], [ 0, %for.body ] - tail call void bitcast (void (...)* @bar to void ()*)() #2 + tail call void bitcast (void (...)* @bar to void ()*)() #0 %inc9 = add nuw nsw i32 %i3.017, 1 %exitcond = icmp eq i32 %inc9, 1024 br i1 %exitcond, label %for.cond.cleanup6, label %for.body7 } -declare void @bar(...) #1 +declare void @bar(...) 
-attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { "target-cpu"="a2q" } -attributes #2 = { nounwind } +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll index 93868007d0d36..aa618d2b732c7 100644 --- a/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll +++ b/llvm/test/CodeGen/PowerPC/lxv-aligned-stack-slots.ll @@ -41,6 +41,6 @@ define void @aligned_slot() #0 { ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 -attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll index 2e834b1fe788c..16fc3ee3e5202 100644 --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -1,5 +1,4 @@ ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr7 < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-PWR -; RUN: llc -verify-machineinstrs -O3 -mcpu=a2q < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-QPX ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 < %s | FileCheck %s -check-prefix=FIXPOINT target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -93,9 +92,6 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds1: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -110,9 +106,6 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds2: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -127,9 +120,6 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -144,9 +134,6 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, < define <4 x float> @vector_reassociate_adds4(<4 x 
float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) { ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds [[REG0:[0-9]+]], 1, 2 -; CHECK-QPX: qvfadds [[REG1:[0-9]+]], 3, 4 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 ; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] @@ -217,9 +204,6 @@ define i64 @reassociate_mulld(i64 %x0, i64 %x1, i64 %x2, i64 %x3) { define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: fmadd [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: fmadd [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: fadd 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 ; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 ; CHECK-PWR: xsadddp 1, 2, 1 @@ -250,9 +234,6 @@ define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, fl define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: -; CHECK-QPX-DAG: qvfmadds [[REG0:[0-9]+]], 4, 3, 2 -; CHECK-QPX-DAG: qvfmadds [[REG1:[0-9]+]], 6, 5, 1 -; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] ; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 ; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 ; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] @@ -268,11 +249,6 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x f define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: -; CHECK-QPX: fmadd [[REG0:[0-9]+]], 2, 1, 7 -; CHECK-QPX-DAG: fmul [[REG1:[0-9]+]], 4, 3 -; CHECK-QPX-DAG: fmadd [[REG2:[0-9]+]], 6, 5, [[REG0]] -; CHECK-QPX-DAG: fmadd [[REG3:[0-9]+]], 9, 8, [[REG1]] -; CHECK-QPX: fadd 1, [[REG2]], [[REG3]] ; CHECK-PWR: xsmaddadp 7, 2, 1 ; CHECK-PWR-DAG: xsmuldp [[REG0:[0-9]+]], 4, 3 ; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 diff --git a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll index e135986a2894c..f807f4fa20d25 100644 --- a/llvm/test/CodeGen/PowerPC/mc-instrlat.ll +++ b/llvm/test/CodeGen/PowerPC/mc-instrlat.ll @@ -19,7 +19,7 @@ entry: declare void @bar(double) #1 -attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll 
b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll index cbb7947be2198..502347a3af198 100644 --- a/llvm/test/CodeGen/PowerPC/mcount-insertion.ll +++ b/llvm/test/CodeGen/PowerPC/mcount-insertion.ll @@ -1,9 +1,8 @@ -; RUN: opt -ee-instrument < %s | opt -inline | llc | FileCheck %s +; RUN: opt -ee-instrument < %s | opt -inline | llc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; The run-line mimics how Clang might run the instrumentation passes. target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" define void @leaf_function() #0 { diff --git a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll index cd0abd6149bde..c4e60f8c4b1f5 100644 --- a/llvm/test/CodeGen/PowerPC/memcpy-vec.ll +++ b/llvm/test/CodeGen/PowerPC/memcpy-vec.ll @@ -1,6 +1,5 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8 -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -25,12 +24,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo1 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -52,12 +45,6 @@ entry: ; PWR8: lxvw4x ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @foo2 -; A2Q-NOT: bl memcpy -; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4) -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -76,11 +63,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar1 -; A2Q-NOT: bl memset -; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3) -; A2Q: blr } ; Function Attrs: nounwind @@ -99,11 +81,6 @@ entry: ; PWR8-NOT: bl memset ; PWR8: stxvw4x ; PWR8: blr - -; A2Q-LABEL: @bar2 -; A2Q-NOT: bl memset -; A2Q: qvstfdx -; A2Q: blr } ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/PowerPC/memset-nc.ll b/llvm/test/CodeGen/PowerPC/memset-nc.ll deleted file mode 100644 index 663d0cb1d6785..0000000000000 --- a/llvm/test/CodeGen/PowerPC/memset-nc.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -O0 < %s | FileCheck %s -check-prefix=CHECK-O0 -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @test_qpx() unnamed_addr #0 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* align 32 null, i8 %1, i64 64, i1 false) - ret void - -; CHECK-LABEL: @test_qpx -; CHECK: qvstfdx -; CHECK: qvstfdx -; CHECK: blr - -; CHECK-O0-LABEL: @test_qpx -; CHECK-O0-NOT: qvstfdx -; CHECK-O0: blr -} - -; Function Attrs: nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1 - -; Function Attrs: nounwind -define void @test_vsx() unnamed_addr #2 align 2 { -entry: - %0 = load i32, i32* undef, align 4 - %1 = trunc i32 %0 to i8 - call void @llvm.memset.p0i8.i64(i8* null, i8 %1, i64 32, i1 false) - ret void - -; CHECK-LABEL: @test_vsx -; CHECK: stxvw4x -; CHECK: stxvw4x -; CHECK: blr - -; CHECK-O0-LABEL: @test_vsx -; CHECK-O0-NOT: stxvw4x -; CHECK-O0: blr -} - -attributes #0 = { nounwind "target-cpu"="a2q" } -attributes #1 = { nounwind } -attributes #2 = { nounwind "target-cpu"="pwr7" } - diff --git 
a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll index 26663d81f3575..089c947713b9d 100644 --- a/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll +++ b/llvm/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,8 +1,7 @@ ; RUN: llc -verify-machineinstrs < %s -enable-misched -pre-RA-sched=source -scheditins=false \ -; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s +; RUN: -disable-ifcvt-triangle-false -disable-post-ra -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" ; %val1 is a load live out of %entry. It should be hoisted ; above the add. diff --git a/llvm/test/CodeGen/PowerPC/misched.ll b/llvm/test/CodeGen/PowerPC/misched.ll index 1c868b3f171c9..9a75fe44b7176 100644 --- a/llvm/test/CodeGen/PowerPC/misched.ll +++ b/llvm/test/CodeGen/PowerPC/misched.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -enable-misched -verify-machineinstrs ; PR14302 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" @b = external global [16000 x double], align 32 diff --git a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll index f59df4291c48f..ad5976318fe3a 100644 --- a/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll +++ b/llvm/test/CodeGen/PowerPC/optnone-crbits-i1-ret.ll @@ -1,6 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" declare zeroext i1 @ri1() declare void @se1() diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll index e8f77574f66cc..75c6d8c24038e 100644 --- a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll +++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll @@ -7,6 +7,9 @@ ; RUN: FileCheck %s ; This test case aims to test the vector multiply instructions on Power10. +; This includes the low order and high order versions of vector multiply. +; The low order version operates on doublewords, whereas the high order version +; operates on signed and unsigned words and doublewords. 
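; Scalar illustration of the high-order pattern the vector tests below rely on (hypothetical
; function, not part of the checked file): widen both operands, form the full double-width
; product, then keep only its top half.
define i64 @mulh_u64_sketch(i64 %a, i64 %b) {
entry:
  %a.wide = zext i64 %a to i128
  %b.wide = zext i64 %b to i128
  %prod = mul i128 %b.wide, %a.wide ; full 128-bit product
  %hi = lshr i128 %prod, 64         ; high-order 64 bits
  %res = trunc i128 %hi to i64
  ret i64 %res
}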
define <2 x i64> @test_vmulld(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vmulld: @@ -17,3 +20,59 @@ entry: %mul = mul <2 x i64> %b, %a ret <2 x i64> %mul } + +define <2 x i64> @test_vmulhsd(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhsd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsd v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = sext <2 x i64> %a to <2 x i128> + %1 = sext <2 x i64> %b to <2 x i128> + %mul = mul <2 x i128> %1, %0 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %tr = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %tr +} + +define <2 x i64> @test_vmulhud(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_vmulhud: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhud v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = zext <2 x i64> %a to <2 x i128> + %1 = zext <2 x i64> %b to <2 x i128> + %mul = mul <2 x i128> %1, %0 + %shr = lshr <2 x i128> %mul, <i128 64, i128 64> + %tr = trunc <2 x i128> %shr to <2 x i64> + ret <2 x i64> %tr +} + +define <4 x i32> @test_vmulhsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhsw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhsw v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = sext <4 x i32> %a to <4 x i64> + %1 = sext <4 x i32> %b to <4 x i64> + %mul = mul <4 x i64> %1, %0 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %tr = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %tr +} + +define <4 x i32> @test_vmulhuw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmulhuw: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmulhuw v2, v3, v2 +; CHECK-NEXT: blr +entry: + %0 = zext <4 x i32> %a to <4 x i64> + %1 = zext <4 x i32> %b to <4 x i64> + %mul = mul <4 x i64> %1, %0 + %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32> + %tr = trunc <4 x i64> %shr to <4 x i32> + ret <4 x i32> %tr +} diff --git a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll index 2e248506c7b7b..2871e077df565 100644 --- a/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll +++ b/llvm/test/CodeGen/PowerPC/pcrel-local-caller-toc.ll @@ -92,7 +92,7 @@ entry: ; Left the target features in this test because it is important that caller has ; -pcrelative-memops while callee has +pcrelative-memops -attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-qpx,-spe" } -attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-qpx,-spe" } +attributes #0 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #1 = { "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+pcrelative-memops,+power8-vector,+power9-vector,+vsx,-htm,-spe" } +attributes #2 = { nounwind "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+power9-vector,+vsx,-htm,-pcrelative-memops,-spe" } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/phi-eliminate.mir b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir new file mode 100644 index 0000000000000..e74ca4ca0e5d9 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/phi-eliminate.mir @@ -0,0 +1,295 @@ +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 %s -o - \ +# RUN: -run-pass=livevars,phi-node-elimination | FileCheck %s + +--- | + define void @phi_eliminate(i32 
%0, i32 %1, i8* %2) { + %scevgep3 = getelementptr i8, i8* %2, i64 undef + call void @llvm.set.loop.iterations.i64(i64 undef) + br label %4 + + 4: ; preds = %4, %3 + %5 = phi i32 [ %8, %4 ], [ %0, %3 ] + %6 = phi i8* [ %scevgep3, %3 ], [ %7, %4 ] + %7 = getelementptr i8, i8* %6, i64 -1 + %8 = sdiv i32 %5, %1 + %9 = mul nsw i32 %8, %1 + %10 = sub nsw i32 %5, %9 + %11 = icmp ult i32 %10, 10 + %12 = trunc i32 %10 to i8 + %13 = select i1 %11, i8 48, i8 55 + %14 = add i8 %13, %12 + store i8 %14, i8* %7, align 1 + %15 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %15, label %4, label %16 + + 16: ; preds = %4 + ret void + } + + declare void @llvm.set.loop.iterations.i64(i64) + + declare i1 @llvm.loop.decrement.i64(i64) + + declare void @llvm.stackprotector(i8*, i8**) +... +--- +name: phi_eliminate +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: + - { id: 0, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 1, class: gprc, preferred-register: '' } + - { id: 2, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 3, class: g8rc, preferred-register: '' } + - { id: 4, class: gprc, preferred-register: '' } + - { id: 5, class: g8rc, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } + - { id: 7, class: g8rc, preferred-register: '' } + - { id: 8, class: gprc, preferred-register: '' } + - { id: 9, class: gprc, preferred-register: '' } + - { id: 10, class: g8rc, preferred-register: '' } + - { id: 11, class: gprc, preferred-register: '' } + - { id: 12, class: gprc, preferred-register: '' } + - { id: 13, class: crrc, preferred-register: '' } + - { id: 14, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 15, class: gprc_and_gprc_nor0, preferred-register: '' } + - { id: 16, class: gprc, preferred-register: '' } + - { id: 17, class: gprc, preferred-register: '' } + - { id: 18, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 19, class: gprc, preferred-register: '' } + - { id: 20, class: gprc, preferred-register: '' } + - { id: 21, class: gprc, preferred-register: '' } + - { id: 22, class: crrc, preferred-register: '' } + - { id: 23, class: gprc, preferred-register: '' } + - { id: 24, class: gprc, preferred-register: '' } + - { id: 25, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 26, class: gprc, preferred-register: '' } + - { id: 27, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 28, class: gprc, preferred-register: '' } + - { id: 29, class: gprc, preferred-register: '' } + - { id: 30, class: gprc, preferred-register: '' } + - { id: 31, class: crrc, preferred-register: '' } + - { id: 32, class: gprc, preferred-register: '' } + - { id: 33, class: gprc, preferred-register: '' } + - { id: 34, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 35, class: gprc, preferred-register: '' } + - { id: 36, class: gprc, preferred-register: '' } + - { id: 37, class: gprc, preferred-register: '' } + - { id: 38, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 39, class: g8rc, preferred-register: '' } + - { id: 40, class: gprc, preferred-register: '' } + - { id: 41, class: gprc, preferred-register: '' } + - { id: 42, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 43, class: gprc, preferred-register: '' } + - { id: 44, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 45, class: gprc, preferred-register: '' } + - { id: 46, class: gprc, 
preferred-register: '' } + - { id: 47, class: crrc, preferred-register: '' } + - { id: 48, class: gprc, preferred-register: '' } + - { id: 49, class: gprc, preferred-register: '' } + - { id: 50, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 51, class: g8rc, preferred-register: '' } + - { id: 52, class: gprc, preferred-register: '' } + - { id: 53, class: gprc, preferred-register: '' } + - { id: 54, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 55, class: gprc, preferred-register: '' } + - { id: 56, class: gprc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%5' } + - { reg: '$x4', virtual-reg: '%6' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: phi_eliminate + ; CHECK: bb.0 (%ir-block.3): + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x3, $x4 + ; CHECK: %6:g8rc = COPY killed $x4 + ; CHECK: %5:g8rc = COPY killed $x3 + ; CHECK: %9:gprc = COPY killed %6.sub_32 + ; CHECK: %8:gprc = COPY killed %5.sub_32 + ; CHECK: MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8 + ; CHECK: %14:gprc_and_gprc_nor0 = LI 55 + ; CHECK: %15:gprc_and_gprc_nor0 = LI 48 + + ; CHECK: bb.1 (%ir-block.4): + ; CHECK: successors: %bb.2(0x40000000), %bb.7(0x40000000) + ; CHECK: %19:gprc = DIVW %8, %9 + ; CHECK: BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8 + + ; CHECK: bb.7: + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: %61:gprc = COPY killed %8 + ; CHECK: %62:g8rc_and_g8rc_nox0 = IMPLICIT_DEF + ; CHECK: %63:gprc = COPY killed %19 + ; CHECK: B %bb.5 + + ; CHECK: bb.2 (%ir-block.4): + ; CHECK: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK: %20:gprc = nsw MULLW %19, %9 + ; CHECK: %21:gprc = SUBF killed %20, killed %8 + ; CHECK: %22:crrc = CMPLWI %21, 10 + ; CHECK: %23:gprc = ISEL %15, %14, killed %22.sub_lt + ; CHECK: %24:gprc = ADD4 killed %23, killed %21 + ; CHECK: %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7) + ; CHECK: %26:gprc = DIVW %19, %9 + ; CHECK: %57:gprc = COPY killed %26 + ; CHECK: %58:gprc = COPY %19 + ; CHECK: %59:g8rc_and_g8rc_nox0 = COPY killed %25 + ; CHECK: %60:gprc = COPY killed %19 + ; CHECK: BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8 + ; CHECK: B %bb.3 + + ; CHECK: bb.3 (%ir-block.4): + ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) + ; CHECK: %38:g8rc_and_g8rc_nox0 = COPY killed %59 + ; CHECK: %37:gprc = COPY %57 + ; CHECK: %36:gprc = COPY killed %58 + ; CHECK: %35:gprc = COPY killed %57 + ; CHECK: %27:g8rc_and_g8rc_nox0 = COPY killed %38 + ; CHECK: %56:gprc = COPY %35 + ; CHECK: %28:gprc = DIVW %56, %9 + ; CHECK: %29:gprc = nsw MULLW killed %37, %9 + ; CHECK: %30:gprc = SUBF killed %29, killed %36 + ; CHECK: %31:crrc = CMPLWI %30, 10 + ; CHECK: %32:gprc = ISEL %15, %14, killed %31.sub_lt + ; CHECK: %33:gprc = ADD4 killed %32, killed %30 + ; CHECK: %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1) + ; CHECK: %57:gprc = COPY killed %28 + ; CHECK: %58:gprc = COPY killed %35 + ; CHECK: %59:g8rc_and_g8rc_nox0 
= COPY killed %34 + ; CHECK: %60:gprc = COPY killed %56 + ; CHECK: BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8 + ; CHECK: B %bb.4 + + ; CHECK: bb.4: + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: %44:g8rc_and_g8rc_nox0 = COPY killed %59 + ; CHECK: %43:gprc = COPY killed %57 + ; CHECK: %41:gprc = COPY killed %60 + ; CHECK: %39:g8rc = COPY killed %44 + ; CHECK: %61:gprc = COPY killed %41 + ; CHECK: %62:g8rc_and_g8rc_nox0 = COPY killed %39 + ; CHECK: %63:gprc = COPY killed %43 + + ; CHECK: bb.5: + ; CHECK: successors: %bb.6(0x80000000) + ; CHECK: %55:gprc = COPY killed %63 + ; CHECK: %54:g8rc_and_g8rc_nox0 = COPY killed %62 + ; CHECK: %53:gprc = COPY killed %61 + ; CHECK: %45:gprc = nsw MULLW killed %55, killed %9 + ; CHECK: %46:gprc = SUBF killed %45, killed %53 + ; CHECK: %47:crrc = CMPLWI %46, 10 + ; CHECK: %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt + ; CHECK: %49:gprc = ADD4 killed %48, killed %46 + ; CHECK: dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1) + ; CHECK: B %bb.6 + + ; CHECK: bb.6 (%ir-block.16): + ; CHECK: BLR8 implicit $lr8, implicit $rm + + bb.0 (%ir-block.3): + successors: %bb.1(0x80000000) + liveins: $x3, $x4 + + %6:g8rc = COPY killed $x4 + %5:g8rc = COPY killed $x3 + %9:gprc = COPY killed %6.sub_32 + %8:gprc = COPY killed %5.sub_32 + MTCTR8loop undef %10:g8rc, implicit-def dead $ctr8 + %14:gprc_and_gprc_nor0 = LI 55 + %15:gprc_and_gprc_nor0 = LI 48 + + bb.1 (%ir-block.4): + successors: %bb.2(0x40000000), %bb.5(0x40000000) + + %19:gprc = DIVW %8, %9 + BDZ8 %bb.5, implicit-def $ctr8, implicit $ctr8 + B %bb.2 + + bb.2 (%ir-block.4): + successors: %bb.3(0x40000000), %bb.4(0x40000000) + + %20:gprc = nsw MULLW %19, %9 + %21:gprc = SUBF killed %20, killed %8 + %22:crrc = CMPLWI %21, 10 + %23:gprc = ISEL %15, %14, killed %22.sub_lt + %24:gprc = ADD4 killed %23, killed %21 + %25:g8rc_and_g8rc_nox0 = STBU killed %24, -1, undef %0:g8rc_and_g8rc_nox0 :: (store 1 into %ir.7) + %26:gprc = DIVW %19, %9 + BDZ8 %bb.4, implicit-def $ctr8, implicit $ctr8 + B %bb.3 + + bb.3 (%ir-block.4): + successors: %bb.3(0x7c000000), %bb.4(0x04000000) + + %35:gprc = PHI %26, %bb.2, %28, %bb.3 + %36:gprc = PHI %19, %bb.2, %35, %bb.3 + %37:gprc = PHI %26, %bb.2, %28, %bb.3 + %38:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3 + %27:g8rc_and_g8rc_nox0 = COPY killed %38 + %56:gprc = COPY %35 + %28:gprc = DIVW %56, %9 + %29:gprc = nsw MULLW killed %37, %9 + %30:gprc = SUBF killed %29, killed %36 + %31:crrc = CMPLWI %30, 10 + %32:gprc = ISEL %15, %14, killed %31.sub_lt + %33:gprc = ADD4 killed %32, killed %30 + %34:g8rc_and_g8rc_nox0 = STBU killed %33, -1, killed %27 :: (store unknown-size into %ir.7, align 1) + BDNZ8 %bb.3, implicit-def $ctr8, implicit $ctr8 + B %bb.4 + + bb.4: + successors: %bb.5(0x80000000) + + %41:gprc = PHI %19, %bb.2, %56, %bb.3 + %43:gprc = PHI %26, %bb.2, %28, %bb.3 + %44:g8rc_and_g8rc_nox0 = PHI %25, %bb.2, %34, %bb.3 + %39:g8rc = COPY killed %44 + + bb.5: + successors: %bb.6(0x80000000) + + %53:gprc = PHI %8, %bb.1, %41, %bb.4 + %54:g8rc_and_g8rc_nox0 = PHI undef %0:g8rc_and_g8rc_nox0, %bb.1, %39, %bb.4 + %55:gprc = PHI %19, %bb.1, %43, %bb.4 + %45:gprc = nsw MULLW killed %55, killed %9 + %46:gprc = SUBF killed %45, killed %53 + %47:crrc = CMPLWI %46, 10 + %48:gprc = ISEL killed %15, killed %14, killed %47.sub_lt + %49:gprc = ADD4 killed %48, killed %46 + dead %50:g8rc_and_g8rc_nox0 = STBU killed %49, -1, killed %54 :: (store unknown-size into %ir.7, align 1) + B %bb.6 + + bb.6 (%ir-block.16): + BLR8 
implicit $lr8, implicit $rm + +... diff --git a/llvm/test/CodeGen/PowerPC/popcnt.ll b/llvm/test/CodeGen/PowerPC/popcnt.ll index a06c59d4b945a..695863d87f16e 100644 --- a/llvm/test/CodeGen/PowerPC/popcnt.ll +++ b/llvm/test/CodeGen/PowerPC/popcnt.ll @@ -1,8 +1,6 @@ ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+popcntd < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mattr=+slow-popcntd < %s | FileCheck %s --check-prefix=SLOWPC ; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q < %s | FileCheck %s --check-prefix=SLOWPC -; RUN: llc -verify-machineinstrs -mtriple=ppc64-- -mcpu=a2q -mattr=+popcntd < %s | FileCheck %s define i64 @_cntb64(i64 %x) nounwind readnone { %cnt = tail call i64 @llvm.ppc.popcntb(i64 %x) diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll index 98343bdb535c2..06f13278d84cd 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -105,14 +105,3 @@ ; STOP-AFTER-BRANCH-COALESCING-NOT: "ppc-branch-coalescing" pass is not registered. ; STOP-AFTER-BRANCH-COALESCING: Branch Coalescing - -; Test pass name: ppc-qpx-load-splat. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-QPX-LOAD-SPLAT -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: -ppc-qpx-load-splat -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. -; STOP-BEFORE-QPX-LOAD-SPLAT-NOT: PowerPC QPX Load Splat Simplification - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-qpx-load-splat -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-QPX-LOAD-SPLAT -; STOP-AFTER-QPX-LOAD-SPLAT: -ppc-qpx-load-splat -; STOP-AFTER-QPX-LOAD-SPLAT-NOT: "ppc-qpx-load-splat" pass is not registered. 
-; STOP-AFTER-QPX-LOAD-SPLAT: PowerPC QPX Load Splat Simplification diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll index fc0e71f878cab..357f28e88b184 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX -; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO +; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO ; RUN: llc < %s -relocation-model=static -O1 -disable-ppc-sco=false -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -code-model=small | FileCheck %s -check-prefix=SCM ; No combination of "powerpc64le-unknown-linux-gnu" + "CHECK-SCO", because @@ -117,23 +117,6 @@ define void @caller_local_sret_32(%S_32* %a) #1 { attributes #0 = { noinline nounwind } attributes #1 = { nounwind } -; vector <4 x i1> test - -define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void } -define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) { - tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b) - ret void - -; <4 x i1> is 32 bytes aligned, if subtarget doesn't support qpx, then we can't -; place b, c to qpx register, so we can't do sco on caller_v4i1_reorder - -; CHECK-SCO-LABEL: caller_v4i1_reorder: -; CHECK-SCO: bl callee_v4i1 - -; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder: -; CHECK-SCO-HASQPX: b callee_v4i1 -} - define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void } define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) diff --git a/llvm/test/CodeGen/PowerPC/pr24546.ll b/llvm/test/CodeGen/PowerPC/pr24546.ll index 28c03293680e5..028fd2d8f0064 100644 --- a/llvm/test/CodeGen/PowerPC/pr24546.ll +++ b/llvm/test/CodeGen/PowerPC/pr24546.ll @@ -47,8 +47,8 @@ declare double @pow(double, double) #0 ; Function Attrs: nounwind readnone declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" 
"less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/pr27350.ll b/llvm/test/CodeGen/PowerPC/pr27350.ll index 982023a1fcdc8..93dbd10fecdeb 100644 --- a/llvm/test/CodeGen/PowerPC/pr27350.ll +++ b/llvm/test/CodeGen/PowerPC/pr27350.ll @@ -18,7 +18,7 @@ entry: declare fastcc void @bar([2 x i64], [2 x i64]) unnamed_addr #1 align 2 attributes #0 = { argmemonly nounwind } -attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/pr28130.ll b/llvm/test/CodeGen/PowerPC/pr28130.ll index cb703dfda8a59..4da415bd29269 100644 --- a/llvm/test/CodeGen/PowerPC/pr28130.ll +++ b/llvm/test/CodeGen/PowerPC/pr28130.ll @@ -67,4 +67,4 @@ bb: ret void } -attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll new file mode 100644 index 0000000000000..79e6026365ba7 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll @@ -0,0 +1,97 @@ +; RUN: llc -verify-machineinstrs -disable-ppc-instr-form-prep=true -mcpu=pwr9 < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck %s -check-prefix=CHECK-P9 +; RUN: llc -verify-machineinstrs -disable-ppc-instr-form-prep=true -mcpu=pwr10 < %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck %s -check-prefix=CHECK-P10 + +target triple = "powerpc64le-unknown-linux-gnu" + +%_elem_type_of_a = 
type <{ double }> +%_elem_type_of_x = type <{ double }> +%_elem_type_of_y = type <{ double }> + +define void @test(i32* dereferenceable(4) %.ial, i32* noalias dereferenceable(4) %.m, i32* noalias dereferenceable(4) %.n, [0 x %_elem_type_of_a]* %.a, i32* noalias dereferenceable(4) %.lda, [0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_y]* noalias %.y) { +; CHECK-P9-LABEL: test: +; CHECK-P9: .LBB0_2: # %_loop_2_do_ +; CHECK-P9: lxv vs1, -16(r4) +; CHECK-P9: lxv vs2, 0(r4) +; CHECK-P9-DAG: lxv vs3, -16(r3) +; CHECK-P9-DAG: lxv vs4, 0(r3) +; CHECK-P9-DAG: xvmaddadp vs1, vs3, vs1 +; CHECK-P9-DAG: stxv vs1, -16(r4) +; CHECK-P9-DAG: xvmaddadp vs2, vs4, vs0 +; CHECK-P9: stxv vs2, 0(r4) +; CHECK-P9: bdnz .LBB0_2 +; +; FIXME: use pair load/store instructions lxvp/stxvp +; CHECK-P10-LABEL: test: +; CHECK-P10: .LBB0_2: # %_loop_2_do_ +; CHECK-P10: lxv vs1, -16(r4) +; CHECK-P10: lxv vs2, 0(r4) +; CHECK-P10-DAG: lxv vs3, -16(r3) +; CHECK-P10-DAG: lxv vs4, 0(r3) +; CHECK-P10-DAG: xvmaddadp vs1, vs3, vs1 +; CHECK-P10-DAG: xvmaddadp vs2, vs4, vs0 +; CHECK-P10-DAG: stxv vs1, -16(r4) +; CHECK-P10: stxv vs2, 0(r4) +; CHECK-P10: bdnz .LBB0_2 +test_entry: + %_conv5 = ptrtoint [0 x %_elem_type_of_a]* %.a to i64 + %_andi_tmp = and i64 %_conv5, 15 + %_equ_tmp = icmp eq i64 %_andi_tmp, 0 + %. = select i1 %_equ_tmp, i32 1, i32 2 + %_val_m_ = load i32, i32* %.m, align 4 + %_sub_tmp9 = sub nsw i32 1, %. + %_add_tmp10 = add i32 %_sub_tmp9, %_val_m_ + %_mod_tmp = srem i32 %_add_tmp10, 16 + %_sub_tmp11 = sub i32 %_val_m_, %_mod_tmp + %_val_n_ = load i32, i32* %.n, align 4 + %x_rvo_based_addr_17 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %_div_tmp = sdiv i32 %_val_n_, 2 + %_conv16 = sext i32 %_div_tmp to i64 + %_ind_cast = getelementptr inbounds %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_17, i64 %_conv16, i32 0 + %_val_x_ = load double, double* %_ind_cast, align 8 + %.splatinsert = insertelement <2 x double> undef, double %_val_x_, i32 0 + %.splat = shufflevector <2 x double> %.splatinsert, <2 x double> undef, <2 x i32> zeroinitializer + %_grt_tmp21 = icmp sgt i32 %., %_sub_tmp11 + br i1 %_grt_tmp21, label %_return_bb, label %_loop_2_do_.lr.ph + +_loop_2_do_.lr.ph: ; preds = %test_entry + %_val_lda_ = load i32, i32* %.lda, align 4 + %_conv = sext i32 %_val_lda_ to i64 + %_mult_tmp = shl nsw i64 %_conv, 3 + %_sub_tmp4 = sub nuw nsw i64 -8, %_mult_tmp + %y_rvo_based_addr_19 = getelementptr inbounds [0 x %_elem_type_of_y], [0 x %_elem_type_of_y]* %.y, i64 0, i64 -1 + %a_byte_ptr_ = bitcast [0 x %_elem_type_of_a]* %.a to i8* + %a_rvo_based_addr_ = getelementptr inbounds i8, i8* %a_byte_ptr_, i64 %_sub_tmp4 + %0 = zext i32 %. 
to i64 + %1 = sext i32 %_sub_tmp11 to i64 + br label %_loop_2_do_ + +_loop_2_do_: ; preds = %_loop_2_do_.lr.ph, %_loop_2_do_ + %indvars.iv = phi i64 [ %0, %_loop_2_do_.lr.ph ], [ %indvars.iv.next, %_loop_2_do_ ] + %_ix_x_len19 = shl nuw nsw i64 %indvars.iv, 3 + %y_ix_dim_0_20 = getelementptr inbounds %_elem_type_of_y, %_elem_type_of_y* %y_rvo_based_addr_19, i64 %indvars.iv + %2 = bitcast %_elem_type_of_y* %y_ix_dim_0_20 to <2 x double>* + %3 = load <2 x double>, <2 x double>* %2, align 1 + %4 = getelementptr %_elem_type_of_y, %_elem_type_of_y* %y_ix_dim_0_20, i64 2 + %5 = bitcast %_elem_type_of_y* %4 to <2 x double>* + %6 = load <2 x double>, <2 x double>* %5, align 1 + %a_ix_dim_1_ = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len19 + %7 = bitcast i8* %a_ix_dim_1_ to <2 x double>* + %8 = load <2 x double>, <2 x double>* %7, align 1 + %9 = getelementptr i8, i8* %a_ix_dim_1_, i64 16 + %10 = bitcast i8* %9 to <2 x double>* + %11 = load <2 x double>, <2 x double>* %10, align 1 + %12 = tail call nsz contract <2 x double> @llvm.fma.v2f64(<2 x double> %8, <2 x double> %3, <2 x double> %3) + %13 = tail call nsz contract <2 x double> @llvm.fma.v2f64(<2 x double> %11, <2 x double> %.splat, <2 x double> %6) + store <2 x double> %12, <2 x double>* %2, align 1 + store <2 x double> %13, <2 x double>* %5, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16 + %_grt_tmp = icmp sgt i64 %indvars.iv.next, %1 + br i1 %_grt_tmp, label %_return_bb, label %_loop_2_do_ + +_return_bb: ; preds = %_loop_2_do_, %test_entry + ret void +} + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll index 04dee1ee182bb..35aec57ec2640 100644 --- a/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll +++ b/llvm/test/CodeGen/PowerPC/preinc-ld-sel-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" +target triple = "powerpc64le-unknown-linux" %t1 = type { %t2*, %t3* } %t2 = type <{ %t3*, i32, [4 x i8] }> diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll b/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll deleted file mode 100644 index 4e0aef4c3df71..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv-sint.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define void @s452(i32 %inp1) nounwind { -entry: - br label %for.body4 - -for.body4: ; preds = %for.body4, %entry - %conv.4 = sitofp i32 %inp1 to double - %conv.5 = sitofp i32 %inp1 to double - %mul.4.v.i0.1 = insertelement <2 x double> undef, double %conv.4, i32 0 - %v = insertelement <2 x double> %mul.4.v.i0.1, double %conv.5, i32 1 - %vv = fmul <2 x double> %v, %v - %add7.4 = fadd <2 x double> %vv, %vv - store <2 x double> %add7.4, <2 x double>* undef, align 16 - br i1 undef, label %for.end, label %for.body4 - -for.end: ; preds = %for.body4 - unreachable -; CHECK-LABEL: @s452 -; CHECK: lfiwax [[REG1:[0-9]+]], -; CHECK: fcfid [[REG2:[0-9]+]], [[REG1]] -; FIXME: We could 'promote' this to a vector earlier and remove this splat. -; CHECK: qvesplati {{[0-9]+}}, [[REG2]], 0 -; CHECK: qvfmul -; CHECK: qvfadd -; CHECK: qvesplati {{[0-9]+}}, -; FIXME: We can use qvstfcdx here instead of two stores. 
-; CHECK: stfd -; CHECK: stfd -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-bv.ll b/llvm/test/CodeGen/PowerPC/qpx-bv.ll deleted file mode 100644 index 93a739b864c1d..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-bv.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s - -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(double %f1, double %f2, double %f3, double %f4) { - %v1 = insertelement <4 x double> undef, double %f1, i32 0 - %v2 = insertelement <4 x double> %v1, double %f2, i32 1 - %v3 = insertelement <4 x double> %v2, double %f3, i32 2 - %v4 = insertelement <4 x double> %v3, double %f4, i32 3 - ret <4 x double> %v4 - -; CHECK-LABEL: @foo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - -define <4 x float> @goo(float %f1, float %f2, float %f3, float %f4) { - %v1 = insertelement <4 x float> undef, float %f1, i32 0 - %v2 = insertelement <4 x float> %v1, float %f2, i32 1 - %v3 = insertelement <4 x float> %v2, float %f3, i32 2 - %v4 = insertelement <4 x float> %v3, float %f4, i32 3 - ret <4 x float> %v4 - -; CHECK-LABEL: @goo -; CHECK: qvgpci [[REG1:[0-9]+]], 275 -; CHECK-DAG: qvgpci [[REG2:[0-9]+]], 101 -; CHECK-DAG: qvfperm [[REG3:[0-9]+]], 3, 4, [[REG1]] -; CHECK-DAG: qvfperm [[REG4:[0-9]+]], 1, 2, [[REG1]] -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG3]], [[REG2]] -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll b/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll deleted file mode 100644 index ccbbd162a0cdb..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-func-clobber.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -declare <4 x double> @foo(<4 x double> %p) - -define <4 x double> @bar(<4 x double> %p, <4 x double> %q) { -entry: - %v = call <4 x double> @foo(<4 x double> %p) - %w = call <4 x double> @foo(<4 x double> %q) - %x = fadd <4 x double> %v, %w - ret <4 x double> %x - -; CHECK-LABEL: @bar -; CHECK: qvstfdx 2, -; CHECK: bl foo -; CHECK: qvstfdx 1, -; CHECK: qvlfdx 1, -; CHECK: bl foo -; CHECK: qvlfdx [[REG:[0-9]+]], -; CHECK: qvfadd 1, [[REG]], 1 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll b/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll deleted file mode 100644 index 50b864980d985..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load-splat.ll +++ /dev/null @@ -1,80 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \ -; RUN: -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s - -; Function Attrs: norecurse nounwind readonly -define <4 x double> @foo(double* nocapture readonly %a) #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxvdsx v2, 0, r3 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %0 = load double, double* %a, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @foox(double* nocapture readonly %a, i64 %idx) #0 { -; 
CHECK-LABEL: foox: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - ret <4 x double> %shuffle.i -} - -define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pptr) #0 { -; CHECK-LABEL: fooxu: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r6, r3, r4 -; CHECK-NEXT: std r6, 0(r5) -; CHECK-NEXT: lxvdsx v2, r3, r4 -; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: blr -entry: - %p = getelementptr double, double* %a, i64 %idx - %0 = load double, double* %p, align 8 - %vecinit.i = insertelement <4 x double> undef, double %0, i32 0 - %shuffle.i = shufflevector <4 x double> %vecinit.i, <4 x double> undef, <4 x i32> zeroinitializer - store double* %p, double** %pptr, align 8 - ret <4 x double> %shuffle.i -} - -define <4 x float> @foof(float* nocapture readonly %a) #0 { -; CHECK-LABEL: foof: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lfiwzx f0, 0, r3 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %0 = load float, float* %a, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - -define <4 x float> @foofx(float* nocapture readonly %a, i64 %idx) #0 { -; CHECK-LABEL: foofx: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sldi r4, r4, 2 -; CHECK-NEXT: lfiwzx f0, r3, r4 -; CHECK-NEXT: xxspltw v2, vs0, 1 -; CHECK-NEXT: blr -entry: - %p = getelementptr float, float* %a, i64 %idx - %0 = load float, float* %p, align 4 - %vecinit.i = insertelement <4 x float> undef, float %0, i32 0 - %shuffle.i = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %shuffle.i -} - - diff --git a/llvm/test/CodeGen/PowerPC/qpx-load.ll b/llvm/test/CodeGen/PowerPC/qpx-load.ll deleted file mode 100644 index 514f0934b6cfc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 31 -; CHECK-DAG: qvlfdx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpcldx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x double> @bar(<4 x double>* %p) { -entry: - %v = load <4 x double>, <4 x double>* %p, align 32 - ret <4 x double> %v -} - -; CHECK: @bar -; CHECK: qvlfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll b/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll deleted file mode 100644 index eab4d6af7e9fc..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-qvfmadd.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -verify-machineinstrs -stop-after=finalize-isel < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <2 x double> @test_qvfmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmadd -; CHECK: QVFMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc 
nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmadds -; CHECK: QVFMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmadd(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmadd -; CHECK: QVFNMADD %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fadd reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmadds(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmadds -; CHECK: QVFNMADDSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fadd reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - -define <2 x double> @test_qvfmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfmsub -; CHECK: QVFMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - ret <2 x double> %5 -} - -define <4 x float> @test_qvfmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfmsubs -; CHECK: QVFMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - ret <4 x float> %5 -} - -define <2 x double> @test_qvfnmsub(<2 x double> %0, <2 x double> %1, <2 x double> %2) { -; CHECK: test_qvfnmsub -; CHECK: QVFNMSUB %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <2 x double> %2, %1 - %5 = fsub reassoc nsz <2 x double> %4, %0 - %6 = fneg reassoc nsz <2 x double> %5 - ret <2 x double> %6 -} - -define <4 x float> @test_qvfnmsubs(<4 x float> %0, <4 x float> %1, <4 x float> %2) { -; CHECK: test_qvfnmsubs -; CHECK: QVFNMSUBSs %2, %1, %0, implicit $rm -; - %4 = fmul reassoc nsz <4 x float> %2, %1 - %5 = fsub reassoc nsz <4 x float> %4, %0 - %6 = fneg reassoc nsz <4 x float> %5 - ret <4 x float> %6 -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll deleted file mode 100644 index 498ab62819ced..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ /dev/null @@ -1,473 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) - -define <4 x double> @foo_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call ninf afn reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv arcp reassoc <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foo_safe(<4 
x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 5, 5 -; CHECK-NEXT: fsqrt 4, 4 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 2, 1, 2 -; CHECK-NEXT: fdiv 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 4, 6, 4 -; CHECK-NEXT: fdiv 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %r = fdiv <4 x double> %a, %x - ret <4 x double> %r -} - -define <4 x double> @foof_fmf(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI2_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI2_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmul 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv arcp reassoc nsz <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x double> @foof_safe(<4 x double> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: foof_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: fsqrts 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrts 0, 0 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdiv 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdiv 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdiv 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %y = fpext <4 x float> %x to <4 x double> - %r = fdiv <4 x double> %a, %y - ret <4 x double> %r -} - -define <4 x float> @food_fmf(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; CHECK-NEXT: qvfrsqrte 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfmsub 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsub 4, 2, 4, 0 -; CHECK-NEXT: qvfmul 3, 3, 4 -; CHECK-NEXT: qvfmul 4, 3, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 4, 0 -; CHECK-NEXT: qvfmul 0, 3, 0 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv arcp reassoc <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @food_safe(<4 x float> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: food_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 2 -; CHECK-NEXT: 
fsqrt 4, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: fsqrt 0, 0 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: qvgpci 5, 275 -; CHECK-NEXT: qvgpci 6, 101 -; CHECK-NEXT: qvfperm 0, 3, 0, 5 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 2, 5 -; CHECK-NEXT: qvfperm 0, 2, 0, 6 -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvfrsp 0, 0 -; CHECK-NEXT: qvesplati 4, 0, 3 -; CHECK-NEXT: fdivs 2, 2, 4 -; CHECK-NEXT: qvesplati 4, 0, 2 -; CHECK-NEXT: fdivs 3, 3, 4 -; CHECK-NEXT: qvesplati 4, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 0 -; CHECK-NEXT: qvesplati 0, 0, 1 -; CHECK-NEXT: fdivs 0, 4, 0 -; CHECK-NEXT: qvfperm 2, 3, 2, 5 -; CHECK-NEXT: qvfperm 0, 1, 0, 5 -; CHECK-NEXT: qvfperm 1, 0, 2, 6 -; CHECK-NEXT: blr -entry: - %x = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %b) - %y = fptrunc <4 x double> %x to <4 x float> - %r = fdiv <4 x float> %a, %y - ret <4 x float> %r -} - -define <4 x float> @goo_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; CHECK-NEXT: qvfrsqrtes 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI6_0@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: qvfmuls 4, 3, 3 -; CHECK-NEXT: qvfmsubs 2, 2, 0, 2 -; CHECK-NEXT: qvfnmsubs 0, 2, 4, 0 -; CHECK-NEXT: qvfmuls 0, 3, 0 -; CHECK-NEXT: qvfmuls 1, 1, 0 -; CHECK-NEXT: blr -entry: - %x = call afn ninf reassoc <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv arcp reassoc nsz <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x float> @goo_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 5, 2, 3 -; CHECK-NEXT: qvesplati 3, 2, 1 -; CHECK-NEXT: qvesplati 4, 2, 2 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 5, 5 -; CHECK-NEXT: fsqrts 4, 4 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: qvesplati 6, 1, 3 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 2, 1, 2 -; CHECK-NEXT: fdivs 5, 6, 5 -; CHECK-NEXT: qvesplati 6, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 4, 6, 4 -; CHECK-NEXT: fdivs 1, 1, 3 -; CHECK-NEXT: qvfperm 3, 4, 5, 0 -; CHECK-NEXT: qvfperm 0, 2, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 3, 1 -; CHECK-NEXT: blr -entry: - %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b) - %r = fdiv <4 x float> %a, %x - ret <4 x float> %r -} - -define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI8_0@toc@ha -; CHECK-NEXT: qvfre 3, 2 -; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l -; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfmadd 0, 2, 3, 0 -; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 -; CHECK-NEXT: qvfmul 3, 1, 0 -; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 -; CHECK-NEXT: qvfmadd 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc nsz ninf <4 x double> %a, %b - ret <4 x double> %r -} - -define <4 x double> @foo2_safe(<4 x double> %a, <4 x double> %b) nounwind { -; CHECK-LABEL: foo2_safe: -; CHECK: # %bb.0: -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdiv 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdiv 4, 4, 5 -; CHECK-NEXT: fdiv 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdiv 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr - %r = fdiv <4 x 
double> %a, %b - ret <4 x double> %r -} - -define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_fmf: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvfres 0, 2 -; CHECK-NEXT: qvfmuls 3, 1, 0 -; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1 -; CHECK-NEXT: qvfmadds 1, 0, 1, 3 -; CHECK-NEXT: blr -entry: - %r = fdiv arcp reassoc ninf <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x float> @goo2_safe(<4 x float> %a, <4 x float> %b) nounwind { -; CHECK-LABEL: goo2_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 3, 2, 3 -; CHECK-NEXT: qvesplati 4, 1, 3 -; CHECK-NEXT: qvesplati 5, 2, 2 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: fdivs 3, 4, 3 -; CHECK-NEXT: qvesplati 4, 1, 2 -; CHECK-NEXT: fdivs 4, 4, 5 -; CHECK-NEXT: fdivs 5, 1, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fdivs 1, 1, 2 -; CHECK-NEXT: qvfperm 2, 4, 3, 0 -; CHECK-NEXT: qvfperm 0, 5, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = fdiv <4 x float> %a, %b - ret <4 x float> %r -} - -define <4 x double> @foo3_fmf_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI12_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI12_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI12_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI12_2@toc@l -; CHECK-NEXT: qvlfdx 3, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfabs 1, 1 -; CHECK-NEXT: qvfcmplt 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_fmf_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI13_0@toc@ha -; CHECK-NEXT: qvfrsqrte 0, 1 -; CHECK-NEXT: addi 3, 3, .LCPI13_0@toc@l -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI13_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI13_1@toc@l -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfmsub 4, 1, 2, 1 -; CHECK-NEXT: qvfnmsub 3, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 3 -; CHECK-NEXT: qvfmul 3, 0, 0 -; CHECK-NEXT: qvfnmsub 2, 4, 3, 2 -; CHECK-NEXT: qvfmul 0, 0, 2 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: qvfmul 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 2 -; CHECK-NEXT: qvfsel 1, 1, 2, 0 -; CHECK-NEXT: blr -entry: - %r = call afn reassoc ninf <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> @foo3_safe_denorm_on(<4 x double> %a) #0 { -; CHECK-LABEL: foo3_safe_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x double> 
@foo3_safe_denorm_off(<4 x double> %a) #1 { -; CHECK-LABEL: foo3_safe_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrt 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrt 2, 2 -; CHECK-NEXT: fsqrt 3, 3 -; CHECK-NEXT: fsqrt 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) - ret <4 x double> %r -} - -define <4 x float> @goo3_fmf_denorm_on(<4 x float> %a) #0 { -; CHECK-LABEL: goo3_fmf_denorm_on: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI16_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI16_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI16_2@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI16_2@toc@l -; CHECK-NEXT: qvlfsx 4, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfabs 2, 1 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmplt 1, 2, 3 -; CHECK-NEXT: qvfsel 1, 1, 4, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc afn ninf nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_fmf_denorm_off(<4 x float> %a) #1 { -; CHECK-LABEL: goo3_fmf_denorm_off: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI17_1@toc@ha -; CHECK-NEXT: qvfrsqrtes 2, 1 -; CHECK-NEXT: addi 3, 3, .LCPI17_1@toc@l -; CHECK-NEXT: qvlfsx 0, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l -; CHECK-NEXT: qvfmuls 4, 2, 2 -; CHECK-NEXT: qvfmsubs 3, 1, 0, 1 -; CHECK-NEXT: qvfnmsubs 0, 3, 4, 0 -; CHECK-NEXT: qvlfsx 3, 0, 3 -; CHECK-NEXT: qvfmuls 0, 2, 0 -; CHECK-NEXT: qvfmuls 0, 0, 1 -; CHECK-NEXT: qvfcmpeq 1, 1, 3 -; CHECK-NEXT: qvfsel 1, 1, 3, 0 -; CHECK-NEXT: blr -entry: - %r = call reassoc ninf afn nsz <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -define <4 x float> @goo3_safe(<4 x float> %a) nounwind { -; CHECK-LABEL: goo3_safe: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 2, 1, 3 -; CHECK-NEXT: qvesplati 3, 1, 2 -; CHECK-NEXT: fsqrts 4, 1 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: fsqrts 2, 2 -; CHECK-NEXT: fsqrts 3, 3 -; CHECK-NEXT: fsqrts 1, 1 -; CHECK-NEXT: qvgpci 0, 275 -; CHECK-NEXT: qvfperm 2, 3, 2, 0 -; CHECK-NEXT: qvfperm 0, 4, 1, 0 -; CHECK-NEXT: qvgpci 1, 101 -; CHECK-NEXT: qvfperm 1, 0, 2, 1 -; CHECK-NEXT: blr -entry: - %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) - ret <4 x float> %r -} - -attributes #0 = { nounwind "denormal-fp-math"="ieee,ieee" } -attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll deleted file mode 100644 index ee3357156a6c0..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-rounding-ops.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2q -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s -target datalayout = 
"E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -define <4 x float> @test1(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test1: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test1: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test2(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test2: -; CHECK: qvfrim 1, 1 - -; CHECK-FM: test2: -; CHECK-FM: qvfrim 1, 1 -} - -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test3(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test3: -; CHECK-NOT: qvfrin - -; CHECK-FM: test3: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test4(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test4: -; CHECK-NOT: qvfrin - -; CHECK-FM: test4: -; CHECK-FM-NOT: qvfrin -} - -declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test5(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test5: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test5: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test6(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test6: -; CHECK: qvfrip 1, 1 - -; CHECK-FM: test6: -; CHECK-FM: qvfrip 1, 1 -} - -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone - -define <4 x float> @test9(<4 x float> %x) nounwind { - %call = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - ret <4 x float> %call - -; CHECK: test9: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test9: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone - -define <4 x double> @test10(<4 x double> %x) nounwind { - %call = tail call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - ret <4 x double> %call - -; CHECK: test10: -; CHECK: qvfriz 1, 1 - -; CHECK-FM: test10: -; CHECK-FM: qvfriz 1, 1 -} - -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll b/llvm/test/CodeGen/PowerPC/qpx-s-load.ll deleted file mode 100644 index 57d7e3b0ded3c..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-load.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define <4 x float> @foo(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %v -} - -; CHECK: @foo -; CHECK-DAG: li [[REG1:[0-9]+]], 15 -; CHECK-DAG: qvlfsx [[REG4:[0-9]+]], 0, 3 -; CHECK-DAG: qvlfsx [[REG2:[0-9]+]], 3, [[REG1]] -; CHECK-DAG: qvlpclsx [[REG3:[0-9]+]], 0, 3 -; CHECK-DAG: qvfperm 1, [[REG4]], [[REG2]], [[REG3]] -; CHECK: blr - -define <4 x 
float> @bar(<4 x float>* %p) { -entry: - %v = load <4 x float>, <4 x float>* %p, align 16 - ret <4 x float> %v -} - -; CHECK: @bar -; CHECK: qvlfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll deleted file mode 100644 index 5d42b9a529953..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-sel.ll +++ /dev/null @@ -1,143 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> , align 16 - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x float> %a, <4 x float> %b - ret <4 x float> %r - -; CHECK-LABEL: @test2 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly. 
-; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x float> @test9(<3 x float> %a, <3 x float> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x float> %a, <3 x float> %b - ret <3 x float> %r - -; CHECK-LABEL: @test9 -; CHECK: stw -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll b/llvm/test/CodeGen/PowerPC/qpx-s-store.ll deleted file mode 100644 index 81cff7b6457f1..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-s-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void -} - -; CHECK: @foo -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: stfs -; CHECK: blr - -define void @bar(<4 x float> %v, <4 x float>* %p) { -entry: - store <4 x float> %v, <4 x float>* %p, align 16 - ret void -} - -; CHECK: @bar -; CHECK: qvstfsx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-sel.ll b/llvm/test/CodeGen/PowerPC/qpx-sel.ll deleted file mode 100644 index abc92c9e98b13..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-sel.ll +++ /dev/null @@ -1,151 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -@R = global <4 x i1> , align 16 - -define <4 x double> @test1(<4 x double> %a, <4 x double> %b, <4 x i1> %c) nounwind readnone { -entry: - %r = select <4 x i1> %c, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test1 -; CHECK: qvfsel 1, 3, 1, 2 -; CHECK: blr -} - -define <4 x double> @test2(<4 x double> %a, <4 x double> %b, i1 %c1, i1 %c2, i1 %c3, i1 %c4) nounwind readnone { -entry: - %v = insertelement <4 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <4 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <4 x i1> %v2, i1 %c3, i32 2 - %v4 = insertelement <4 x i1> %v3, i1 %c4, i32 3 - %r = select <4 x i1> %v4, <4 x double> %a, <4 x double> %b - ret <4 x double> %r - -; CHECK-LABEL: @test2 - -; FIXME: This load/store sequence is unnecessary. 
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - -define <4 x i1> @test3(<4 x i1> %a) nounwind readnone { -entry: - %v = and <4 x i1> %a, - ret <4 x i1> %v - -; CHECK-LABEL: @test3 -; CHECK: qvlfsx [[REG:[0-9]+]], -; qvflogical 1, 1, [[REG]], 1 -; blr -} - -define <4 x i1> @test4(<4 x i1> %a, <4 x i1>* %t) nounwind { -entry: - %q = load <4 x i1>, <4 x i1>* %t, align 16 - %v = and <4 x i1> %a, %q - ret <4 x i1> %v - -; CHECK-LABEL: @test4 -; CHECK-DAG: lbz -; CHECK-DAG: qvlfdx [[REG1:[0-9]+]], -; CHECK-DAG: stw -; CHECK-DAG: qvlfiwzx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG1]] -; CHECK: qvfand 1, 1, [[REG4]] -; CHECK: blr -} - -define void @test5(<4 x i1> %a) nounwind { -entry: - store <4 x i1> %a, <4 x i1>* @R - ret void - -; CHECK-LABEL: @test5 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: stb -; CHECK: blr -} - -define i1 @test6(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test6 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define i1 @test7(<4 x i1> %a) nounwind { -entry: - %r = extractelement <4 x i1> %a, i32 2 - %s = extractelement <4 x i1> %a, i32 3 - %q = and i1 %r, %s - ret i1 %q - -; CHECK-LABEL: @test7 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG4:[0-9]+]], -; FIXME: We're storing the vector twice, and that's silly. -; CHECK-DAG: qvstfiwx [[REG3]], -; CHECK-DAG: lwz [[REG5:[0-9]+]], -; CHECK: and 3, -; CHECK: blr -} - -define i1 @test8(<3 x i1> %a) nounwind { -entry: - %r = extractelement <3 x i1> %a, i32 2 - ret i1 %r - -; CHECK-LABEL: @test8 -; CHECK: qvlfdx [[REG1:[0-9]+]], -; CHECK: qvfmadd [[REG2:[0-9]+]], 1, [[REG1]], [[REG1]] -; CHECK: qvfctiwu [[REG3:[0-9]+]], [[REG2]] -; CHECK: qvstfiwx [[REG3]], -; CHECK: lwz -; CHECK: blr -} - -define <3 x double> @test9(<3 x double> %a, <3 x double> %b, i1 %c1, i1 %c2, i1 %c3) nounwind readnone { -entry: - %v = insertelement <3 x i1> undef, i1 %c1, i32 0 - %v2 = insertelement <3 x i1> %v, i1 %c2, i32 1 - %v3 = insertelement <3 x i1> %v2, i1 %c3, i32 2 - %r = select <3 x i1> %v3, <3 x double> %a, <3 x double> %b - ret <3 x double> %r - -; CHECK-LABEL: @test9 - -; FIXME: This load/store sequence is unnecessary. 
-; CHECK-DAG: lbz -; CHECK-DAG: stw - -; CHECK-DAG: qvlfiwzx [[REG1:[0-9]+]], -; CHECK-DAG: qvlfdx [[REG2:[0-9]+]], -; CHECK-DAG: qvfcfidu [[REG3:[0-9]+]], [[REG1]] -; CHECK: qvfcmpeq [[REG4:[0-9]+]], [[REG3]], [[REG2]] -; CHECK: qvfsel 1, [[REG4]], 1, 2 -; CHECK: blr -} - diff --git a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll b/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll deleted file mode 100644 index df3e0befaef8a..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-split-vsetcc.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-bgq-linux" - -; Function Attrs: nounwind -define void @gsl_sf_legendre_Pl_deriv_array(<4 x i32> %inp1, <4 x double> %inp2) #0 { -entry: - br label %vector.body198 - -vector.body198: ; preds = %vector.body198, %for.body46.lr.ph - %0 = icmp ne <4 x i32> %inp1, zeroinitializer - %1 = select <4 x i1> %0, <4 x double> , <4 x double> - %2 = fmul <4 x double> %inp2, %1 - %3 = fmul <4 x double> %inp2, %2 - %4 = fmul <4 x double> %3, %inp2 - store <4 x double> %4, <4 x double>* undef, align 8 - br label %return - -; CHECK-LABEL: @gsl_sf_legendre_Pl_deriv_array -; CHECK: qvlfiwzx -; CHECK: qvfcfidu -; CHECK: qvfcmpeq -; CHECK: qvfsel -; CHECK: qvfmul - -return: ; preds = %if.else.i - ret void -} - -attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/PowerPC/qpx-store.ll b/llvm/test/CodeGen/PowerPC/qpx-store.ll deleted file mode 100644 index 2b96576ce4493..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-store.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target triple = "powerpc64-bgq-linux" - -define void @foo(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void -} - -; CHECK: @foo -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: stfd -; CHECK: blr - -define void @bar(<4 x double> %v, <4 x double>* %p) { -entry: - store <4 x double> %v, <4 x double>* %p, align 32 - ret void -} - -; CHECK: @bar -; CHECK: qvstfdx - diff --git a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll b/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll deleted file mode 100644 index e7ab92db6efc9..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-unal-cons-lds.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s | FileCheck %s -target datalayout = "E-m:e-i64:64-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -; Function Attrs: nounwind -define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 { -entry: - br label %vector.body - -; CHECK-LABEL: @foo -; Make sure that the offset constants we use are all even (only the last should be odd). 
-; CHECK-DAG: li {{[0-9]+}}, 1056 -; CHECK-DAG: li {{[0-9]+}}, 1088 -; CHECK-DAG: li {{[0-9]+}}, 1152 -; CHECK-DAG: li {{[0-9]+}}, 1216 -; CHECK-DAG: li {{[0-9]+}}, 1280 -; CHECK-DAG: li {{[0-9]+}}, 1344 -; CHECK-DAG: li {{[0-9]+}}, 1408 -; CHECK-DAG: li {{[0-9]+}}, 1472 -; CHECK-DAG: li {{[0-9]+}}, 1536 -; CHECK-DAG: li {{[0-9]+}}, 1600 -; CHECK-DAG: li {{[0-9]+}}, 1568 -; CHECK-DAG: li {{[0-9]+}}, 1664 -; CHECK-DAG: li {{[0-9]+}}, 1632 -; CHECK-DAG: li {{[0-9]+}}, 1728 -; CHECK-DAG: li {{[0-9]+}}, 1696 -; CHECK-DAG: li {{[0-9]+}}, 1792 -; CHECK-DAG: li {{[0-9]+}}, 1760 -; CHECK-DAG: li {{[0-9]+}}, 1856 -; CHECK-DAG: li {{[0-9]+}}, 1824 -; CHECK-DAG: li {{[0-9]+}}, 1920 -; CHECK-DAG: li {{[0-9]+}}, 1888 -; CHECK-DAG: li {{[0-9]+}}, 1984 -; CHECK-DAG: li {{[0-9]+}}, 1952 -; CHECK-DAG: li {{[0-9]+}}, 2016 -; CHECK-DAG: li {{[0-9]+}}, 1024 -; CHECK-DAG: li {{[0-9]+}}, 1120 -; CHECK-DAG: li {{[0-9]+}}, 1184 -; CHECK-DAG: li {{[0-9]+}}, 1248 -; CHECK-DAG: li {{[0-9]+}}, 1312 -; CHECK-DAG: li {{[0-9]+}}, 1376 -; CHECK-DAG: li {{[0-9]+}}, 1440 -; CHECK-DAG: li {{[0-9]+}}, 1504 -; CHECK-DAG: li {{[0-9]+}}, 2047 -; CHECK: blr - -vector.body: ; preds = %vector.body, %entry - %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ] - %0 = shl i64 %index, 1 - %1 = getelementptr inbounds double, double* %b, i64 %0 - %2 = bitcast double* %1 to <8 x double>* - %wide.vec = load <8 x double>, <8 x double>* %2, align 8 - %strided.vec = shufflevector <8 x double> %wide.vec, <8 x double> undef, <4 x i32> - %3 = fadd <4 x double> %strided.vec, - %4 = getelementptr inbounds double, double* %a, i64 %index - %5 = bitcast double* %4 to <4 x double>* - store <4 x double> %3, <4 x double>* %5, align 8 - %index.next = or i64 %index, 4 - %6 = shl i64 %index.next, 1 - %7 = getelementptr inbounds double, double* %b, i64 %6 - %8 = bitcast double* %7 to <8 x double>* - %wide.vec.1 = load <8 x double>, <8 x double>* %8, align 8 - %strided.vec.1 = shufflevector <8 x double> %wide.vec.1, <8 x double> undef, <4 x i32> - %9 = fadd <4 x double> %strided.vec.1, - %10 = getelementptr inbounds double, double* %a, i64 %index.next - %11 = bitcast double* %10 to <4 x double>* - store <4 x double> %9, <4 x double>* %11, align 8 - %index.next.1 = or i64 %index, 8 - %12 = shl i64 %index.next.1, 1 - %13 = getelementptr inbounds double, double* %b, i64 %12 - %14 = bitcast double* %13 to <8 x double>* - %wide.vec.2 = load <8 x double>, <8 x double>* %14, align 8 - %strided.vec.2 = shufflevector <8 x double> %wide.vec.2, <8 x double> undef, <4 x i32> - %15 = fadd <4 x double> %strided.vec.2, - %16 = getelementptr inbounds double, double* %a, i64 %index.next.1 - %17 = bitcast double* %16 to <4 x double>* - store <4 x double> %15, <4 x double>* %17, align 8 - %index.next.2 = or i64 %index, 12 - %18 = shl i64 %index.next.2, 1 - %19 = getelementptr inbounds double, double* %b, i64 %18 - %20 = bitcast double* %19 to <8 x double>* - %wide.vec.3 = load <8 x double>, <8 x double>* %20, align 8 - %strided.vec.3 = shufflevector <8 x double> %wide.vec.3, <8 x double> undef, <4 x i32> - %21 = fadd <4 x double> %strided.vec.3, - %22 = getelementptr inbounds double, double* %a, i64 %index.next.2 - %23 = bitcast double* %22 to <4 x double>* - store <4 x double> %21, <4 x double>* %23, align 8 - %index.next.3 = or i64 %index, 16 - %24 = shl i64 %index.next.3, 1 - %25 = getelementptr inbounds double, double* %b, i64 %24 - %26 = bitcast double* %25 to <8 x double>* - %wide.vec.4 = load <8 x double>, <8 x double>* %26, align 8 - %strided.vec.4 = 
shufflevector <8 x double> %wide.vec.4, <8 x double> undef, <4 x i32> - %27 = fadd <4 x double> %strided.vec.4, - %28 = getelementptr inbounds double, double* %a, i64 %index.next.3 - %29 = bitcast double* %28 to <4 x double>* - store <4 x double> %27, <4 x double>* %29, align 8 - %index.next.4 = or i64 %index, 20 - %30 = shl i64 %index.next.4, 1 - %31 = getelementptr inbounds double, double* %b, i64 %30 - %32 = bitcast double* %31 to <8 x double>* - %wide.vec.5 = load <8 x double>, <8 x double>* %32, align 8 - %strided.vec.5 = shufflevector <8 x double> %wide.vec.5, <8 x double> undef, <4 x i32> - %33 = fadd <4 x double> %strided.vec.5, - %34 = getelementptr inbounds double, double* %a, i64 %index.next.4 - %35 = bitcast double* %34 to <4 x double>* - store <4 x double> %33, <4 x double>* %35, align 8 - %index.next.5 = or i64 %index, 24 - %36 = shl i64 %index.next.5, 1 - %37 = getelementptr inbounds double, double* %b, i64 %36 - %38 = bitcast double* %37 to <8 x double>* - %wide.vec.6 = load <8 x double>, <8 x double>* %38, align 8 - %strided.vec.6 = shufflevector <8 x double> %wide.vec.6, <8 x double> undef, <4 x i32> - %39 = fadd <4 x double> %strided.vec.6, - %40 = getelementptr inbounds double, double* %a, i64 %index.next.5 - %41 = bitcast double* %40 to <4 x double>* - store <4 x double> %39, <4 x double>* %41, align 8 - %index.next.6 = or i64 %index, 28 - %42 = shl i64 %index.next.6, 1 - %43 = getelementptr inbounds double, double* %b, i64 %42 - %44 = bitcast double* %43 to <8 x double>* - %wide.vec.7 = load <8 x double>, <8 x double>* %44, align 8 - %strided.vec.7 = shufflevector <8 x double> %wide.vec.7, <8 x double> undef, <4 x i32> - %45 = fadd <4 x double> %strided.vec.7, - %46 = getelementptr inbounds double, double* %a, i64 %index.next.6 - %47 = bitcast double* %46 to <4 x double>* - store <4 x double> %45, <4 x double>* %47, align 8 - %index.next.7 = or i64 %index, 32 - %48 = shl i64 %index.next.7, 1 - %49 = getelementptr inbounds double, double* %b, i64 %48 - %50 = bitcast double* %49 to <8 x double>* - %wide.vec.8 = load <8 x double>, <8 x double>* %50, align 8 - %strided.vec.8 = shufflevector <8 x double> %wide.vec.8, <8 x double> undef, <4 x i32> - %51 = fadd <4 x double> %strided.vec.8, - %52 = getelementptr inbounds double, double* %a, i64 %index.next.7 - %53 = bitcast double* %52 to <4 x double>* - store <4 x double> %51, <4 x double>* %53, align 8 - %index.next.8 = or i64 %index, 36 - %54 = shl i64 %index.next.8, 1 - %55 = getelementptr inbounds double, double* %b, i64 %54 - %56 = bitcast double* %55 to <8 x double>* - %wide.vec.9 = load <8 x double>, <8 x double>* %56, align 8 - %strided.vec.9 = shufflevector <8 x double> %wide.vec.9, <8 x double> undef, <4 x i32> - %57 = fadd <4 x double> %strided.vec.9, - %58 = getelementptr inbounds double, double* %a, i64 %index.next.8 - %59 = bitcast double* %58 to <4 x double>* - store <4 x double> %57, <4 x double>* %59, align 8 - %index.next.9 = or i64 %index, 40 - %60 = shl i64 %index.next.9, 1 - %61 = getelementptr inbounds double, double* %b, i64 %60 - %62 = bitcast double* %61 to <8 x double>* - %wide.vec.10 = load <8 x double>, <8 x double>* %62, align 8 - %strided.vec.10 = shufflevector <8 x double> %wide.vec.10, <8 x double> undef, <4 x i32> - %63 = fadd <4 x double> %strided.vec.10, - %64 = getelementptr inbounds double, double* %a, i64 %index.next.9 - %65 = bitcast double* %64 to <4 x double>* - store <4 x double> %63, <4 x double>* %65, align 8 - %index.next.10 = or i64 %index, 44 - %66 = shl i64 %index.next.10, 1 - 
%67 = getelementptr inbounds double, double* %b, i64 %66 - %68 = bitcast double* %67 to <8 x double>* - %wide.vec.11 = load <8 x double>, <8 x double>* %68, align 8 - %strided.vec.11 = shufflevector <8 x double> %wide.vec.11, <8 x double> undef, <4 x i32> - %69 = fadd <4 x double> %strided.vec.11, - %70 = getelementptr inbounds double, double* %a, i64 %index.next.10 - %71 = bitcast double* %70 to <4 x double>* - store <4 x double> %69, <4 x double>* %71, align 8 - %index.next.11 = or i64 %index, 48 - %72 = shl i64 %index.next.11, 1 - %73 = getelementptr inbounds double, double* %b, i64 %72 - %74 = bitcast double* %73 to <8 x double>* - %wide.vec.12 = load <8 x double>, <8 x double>* %74, align 8 - %strided.vec.12 = shufflevector <8 x double> %wide.vec.12, <8 x double> undef, <4 x i32> - %75 = fadd <4 x double> %strided.vec.12, - %76 = getelementptr inbounds double, double* %a, i64 %index.next.11 - %77 = bitcast double* %76 to <4 x double>* - store <4 x double> %75, <4 x double>* %77, align 8 - %index.next.12 = or i64 %index, 52 - %78 = shl i64 %index.next.12, 1 - %79 = getelementptr inbounds double, double* %b, i64 %78 - %80 = bitcast double* %79 to <8 x double>* - %wide.vec.13 = load <8 x double>, <8 x double>* %80, align 8 - %strided.vec.13 = shufflevector <8 x double> %wide.vec.13, <8 x double> undef, <4 x i32> - %81 = fadd <4 x double> %strided.vec.13, - %82 = getelementptr inbounds double, double* %a, i64 %index.next.12 - %83 = bitcast double* %82 to <4 x double>* - store <4 x double> %81, <4 x double>* %83, align 8 - %index.next.13 = or i64 %index, 56 - %84 = shl i64 %index.next.13, 1 - %85 = getelementptr inbounds double, double* %b, i64 %84 - %86 = bitcast double* %85 to <8 x double>* - %wide.vec.14 = load <8 x double>, <8 x double>* %86, align 8 - %strided.vec.14 = shufflevector <8 x double> %wide.vec.14, <8 x double> undef, <4 x i32> - %87 = fadd <4 x double> %strided.vec.14, - %88 = getelementptr inbounds double, double* %a, i64 %index.next.13 - %89 = bitcast double* %88 to <4 x double>* - store <4 x double> %87, <4 x double>* %89, align 8 - %index.next.14 = or i64 %index, 60 - %90 = shl i64 %index.next.14, 1 - %91 = getelementptr inbounds double, double* %b, i64 %90 - %92 = bitcast double* %91 to <8 x double>* - %wide.vec.15 = load <8 x double>, <8 x double>* %92, align 8 - %strided.vec.15 = shufflevector <8 x double> %wide.vec.15, <8 x double> undef, <4 x i32> - %93 = fadd <4 x double> %strided.vec.15, - %94 = getelementptr inbounds double, double* %a, i64 %index.next.14 - %95 = bitcast double* %94 to <4 x double>* - store <4 x double> %93, <4 x double>* %95, align 8 - %index.next.15 = add nsw i64 %index, 64 - %96 = icmp eq i64 %index.next.15, 1600 - br i1 %96, label %for.cond.cleanup, label %vector.body - -for.cond.cleanup: ; preds = %vector.body - ret void -} - -attributes #0 = { nounwind "target-cpu"="a2q" } - diff --git a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll b/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll deleted file mode 100644 index fdee919fdfc32..0000000000000 --- a/llvm/test/CodeGen/PowerPC/qpx-unalperm.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -verify-machineinstrs < %s -mcpu=a2q | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" - -define <4 x double> @foo(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 32 - ret <4 x double> %r -; CHECK: qvlfdx -; CHECK: blr -} - -define <4 x double> @bar(<4 x 
double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 - %s = load <4 x double>, <4 x double>* %b, align 32 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -; CHECK: qvlpcldx -; CHECK: qvlfdx -; CHECK: qvfperm -; CHECK: blr -} - -define <4 x double> @bar1(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 16 - %s = load <4 x double>, <4 x double>* %b, align 8 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar2(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 32 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar3(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 8 - %t = fadd <4 x double> %r, %s - ret <4 x double> %t -} - -define <4 x double> @bar4(<4 x double>* %a) { -entry: - %r = load <4 x double>, <4 x double>* %a, align 8 - %b = getelementptr <4 x double>, <4 x double>* %a, i32 1 - %s = load <4 x double>, <4 x double>* %b, align 8 - %c = getelementptr <4 x double>, <4 x double>* %b, i32 1 - %t = load <4 x double>, <4 x double>* %c, align 8 - %u = fadd <4 x double> %r, %s - %v = fadd <4 x double> %u, %t - ret <4 x double> %v -} - diff --git a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll index e8fc409527588..d512f51a76e7a 100644 --- a/llvm/test/CodeGen/PowerPC/rlwimi-and.ll +++ b/llvm/test/CodeGen/PowerPC/rlwimi-and.ll @@ -1,6 +1,4 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits < %s | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-crbits -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s define void @test() align 2 { entry: diff --git a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir index e3aeb5605b42c..dbe314b5251fe 100644 --- a/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir +++ b/llvm/test/CodeGen/PowerPC/rlwinm_rldicl_to_andi.mir @@ -60,7 +60,7 @@ ret i64 %cond } - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" 
"unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/CodeGen/PowerPC/rounding-rm-flag.ll b/llvm/test/CodeGen/PowerPC/rounding-rm-flag.ll new file mode 100644 index 0000000000000..0af781f6657b6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/rounding-rm-flag.ll @@ -0,0 +1,26 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -stop-after=early-ifcvt < %s | FileCheck %s + +define float @test_XSRDPI(float %f) { +entry: + %0 = tail call float @llvm.round.f32(float %f) + ret float %0 + +; CHECK-LABEL: name: test_XSRDPI +; CHECK-NOT: %2:vsfrc = nofpexcept XSRDPI killed %1, implicit $rm +; CHECK: %2:vsfrc = nofpexcept XSRDPI killed %1 +} + +define double @test_XSRDPIM(double %d) { +entry: + %0 = tail call double @llvm.floor.f64(double %d) + ret double %0 + +; CHECK-LABEL: name: test_XSRDPIM +; CHECK-NOT: %1:vsfrc = nofpexcept XSRDPIM %0, implicit $rm +; CHECK: %1:vsfrc = nofpexcept XSRDPIM %0 +} + +declare float @llvm.round.f32(float) +declare double @llvm.floor.f64(double) + diff --git a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll index 5c15145af2378..20071ea1710c5 100644 --- a/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll +++ b/llvm/test/CodeGen/PowerPC/s000-alias-misched.ll @@ -1,7 +1,6 @@ -; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 | FileCheck %s -; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -enable-misched -enable-aa-sched-mi -mcpu=a2 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" -target triple = "powerpc64-bgq-linux" @aa = external global [256 x [256 x double]], align 32 @bb = external global [256 x [256 x double]], align 32 diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll index 80ac733156197..9f458ebcf0a6e 100644 --- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll +++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll @@ -1225,576 +1225,5 @@ entry: ; CHECK: blr } -define <4 x double> @testqv4doubleslt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleslt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleult(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleult -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: 
qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublesle(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleule(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleeq(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleeq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x double> @testqv4doublesge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleuge(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleuge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublesgt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublesgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 
6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doubleugt(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doubleugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x double> @testqv4doublene(float %c1, float %c2, float %c3, float %c4, <4 x double> %a1, <4 x double> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x double> %a1, <4 x double> %a2 - ret <4 x double> %cond - -; CHECK-LABEL: @testqv4doublene -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x float> @testqv4floatslt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatslt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatult(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatult -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatsle(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatsle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatule(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, 
.LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floateq(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floateq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x float> @testqv4floatsge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatsge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatuge(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatuge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatsgt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatsgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatugt(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x float> @testqv4floatne(float %c1, float %c2, float %c3, float %c4, <4 x float> %a1, <4 x float> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2 - ret <4 x float> %cond - -; CHECK-LABEL: @testqv4floatne -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, 
[[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x i1> @testqv4i1slt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp slt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1slt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ult(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ult i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ult -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sle(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sle i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sle -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ule(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ule i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ule -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1eq(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp eq i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1eq -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: creqv [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1uge(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { 
-entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp uge i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1uge -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB]] -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1sgt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp sgt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1sgt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 4, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 4, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ugt(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ugt i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ugt -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK: bc 12, 2, .LBB[[BB1:[0-9_]+]] -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: bc 12, 2, .LBB[[BB2:[0-9_]+]] -; CHECK: .LBB[[BB1]]: -; CHECK: qvfmr 5, 6 -; CHECK: .LBB[[BB2]]: -; CHECK: qvfmr 1, 5 -; CHECK: blr -} - -define <4 x i1> @testqv4i1ne(float %c1, float %c2, float %c3, float %c4, <4 x i1> %a1, <4 x i1> %a2) #1 { -entry: - %cmp1 = fcmp oeq float %c3, %c4 - %cmp3tmp = fcmp oeq float %c1, %c2 - %cmp3 = icmp ne i1 %cmp3tmp, %cmp1 - %cond = select i1 %cmp3, <4 x i1> %a1, <4 x i1> %a2 - ret <4 x i1> %cond - -; CHECK-LABEL: @testqv4i1ne -; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4 -; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2 -; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}} -; CHECK: bclr 12, [[REG1]], 0 -; CHECK: qvfmr 1, 6 -; CHECK: blr -} - attributes #0 = { nounwind readnone "target-cpu"="pwr7" } -attributes #1 = { nounwind readnone "target-cpu"="a2q" } diff --git a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll index 53d17d8668270..73fce78c33aa7 100644 --- a/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll +++ b/llvm/test/CodeGen/PowerPC/selectiondag-extload-computeknownbits.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mtriple=powerpc64-bgq-linux < %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux < %s ; Check that llc does not crash due to an illegal APInt operation diff --git a/llvm/test/CodeGen/PowerPC/setcc-vector.ll b/llvm/test/CodeGen/PowerPC/setcc-vector.ll new file mode 100644 index 0000000000000..5917ccabf84ed --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/setcc-vector.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR9 %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR8 %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 
-mtriple=powerpc64-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR7 %s + +define <1 x i64> @setcc_v1i128(<1 x i128> %a) { +; CHECK-PWR9-LABEL: setcc_v1i128: +; CHECK-PWR9: # %bb.0: # %entry +; CHECK-PWR9-NEXT: mfvsrld r3, vs34 +; CHECK-PWR9-NEXT: cmpldi r3, 35708 +; CHECK-PWR9-NEXT: mfvsrd r3, vs34 +; CHECK-PWR9-NEXT: cmpdi cr1, r3, 0 +; CHECK-PWR9-NEXT: li r3, 1 +; CHECK-PWR9-NEXT: crnand 4*cr5+lt, 4*cr1+eq, lt +; CHECK-PWR9-NEXT: isel r3, 0, r3, 4*cr5+lt +; CHECK-PWR9-NEXT: blr +; +; CHECK-PWR8-LABEL: setcc_v1i128: +; CHECK-PWR8: # %bb.0: # %entry +; CHECK-PWR8-NEXT: xxswapd vs0, vs34 +; CHECK-PWR8-NEXT: mfvsrd r3, vs34 +; CHECK-PWR8-NEXT: cmpdi r3, 0 +; CHECK-PWR8-NEXT: li r3, 1 +; CHECK-PWR8-NEXT: mffprd r4, f0 +; CHECK-PWR8-NEXT: cmpldi cr1, r4, 35708 +; CHECK-PWR8-NEXT: crnand 4*cr5+lt, eq, 4*cr1+lt +; CHECK-PWR8-NEXT: isel r3, 0, r3, 4*cr5+lt +; CHECK-PWR8-NEXT: blr +; +; CHECK-PWR7-LABEL: setcc_v1i128: +; CHECK-PWR7: # %bb.0: # %entry +; CHECK-PWR7-NEXT: li r5, 0 +; CHECK-PWR7-NEXT: cntlzd r3, r3 +; CHECK-PWR7-NEXT: ori r5, r5, 35708 +; CHECK-PWR7-NEXT: rldicl r3, r3, 58, 63 +; CHECK-PWR7-NEXT: subc r5, r4, r5 +; CHECK-PWR7-NEXT: subfe r4, r4, r4 +; CHECK-PWR7-NEXT: neg r4, r4 +; CHECK-PWR7-NEXT: and r3, r3, r4 +; CHECK-PWR7-NEXT: blr +entry: + %0 = icmp ult <1 x i128> %a, <i128 35708> + %1 = zext <1 x i1> %0 to <1 x i64> + ret <1 x i64> %1 +} + diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc.mir b/llvm/test/CodeGen/PowerPC/setcr_bc.mir index e9d81da681fcc..564ee7d45957b 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le"
"target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } ... --- diff --git a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir index 582284d6d0a59..513cb85e1580a 100644 --- a/llvm/test/CodeGen/PowerPC/setcr_bc2.mir +++ b/llvm/test/CodeGen/PowerPC/setcr_bc2.mir @@ -32,8 +32,8 @@ ret i32 %call2.i.sink } - attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } ... 
--- diff --git a/llvm/test/CodeGen/PowerPC/setrnd.ll b/llvm/test/CodeGen/PowerPC/setrnd.ll index a732e3f73c992..9080a4b0ee51f 100644 --- a/llvm/test/CodeGen/PowerPC/setrnd.ll +++ b/llvm/test/CodeGen/PowerPC/setrnd.ll @@ -19,8 +19,8 @@ entry: ; AFTER-FINALIZE-ISEL: test_setrndi ; AFTER-FINALIZE-ISEL: MFFS implicit $rm -; AFTER-FINALIZE-ISEL: MTFSB0 31, implicit-def $rm, implicit $rm -; AFTER-FINALIZE-ISEL: MTFSB1 30, implicit-def $rm, implicit $rm +; AFTER-FINALIZE-ISEL: MTFSB0 31, implicit-def $rm +; AFTER-FINALIZE-ISEL: MTFSB1 30, implicit-def $rm ; CHECK-LABEL: @test_setrndi ; CHECK: # %bb.0: @@ -40,7 +40,7 @@ entry: ; AFTER-FINALIZE-ISEL: test_setrnd ; AFTER-FINALIZE-ISEL: MFFS implicit $rm -; AFTER-FINALIZE-ISEL: MTFSF 255, %7, 0, 0 +; AFTER-FINALIZE-ISEL: MTFSF 255, %7, 0, 0, implicit-def $rm ; CHECK-LABEL: @test_setrnd ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/PowerPC/spe-fastmath.ll b/llvm/test/CodeGen/PowerPC/spe-fastmath.ll new file mode 100644 index 0000000000000..d2b83f7ee1da2 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/spe-fastmath.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu \ +; RUN: -mattr=+spe | FileCheck %s + +define void @no_fsel(i32 %e) #0 { +; CHECK-LABEL: no_fsel: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 4, .LCPI0_0@l +; CHECK-NEXT: lis 5, .LCPI0_0@ha +; CHECK-NEXT: evlddx 4, 5, 4 +; CHECK-NEXT: efdcfui 3, 3 +; CHECK-NEXT: efdmul 5, 3, 3 +; CHECK-NEXT: efdcmpeq 0, 5, 4 +; CHECK-NEXT: ble 0, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: evor 3, 4, 4 +; CHECK-NEXT: .LBB0_2: # %entry +; CHECK-NEXT: efdctsiz 3, 3 +; CHECK-NEXT: sth 3, 0(3) +; CHECK-NEXT: blr +entry: + %conv = uitofp i32 %e to double + %mul = fmul double %conv, %conv + %tobool = fcmp une double %mul, 0.000000e+00 + %cond = select i1 %tobool, double %conv, double 0.000000e+00 + %conv3 = fptosi double %cond to i16 + store i16 %conv3, i16* undef + ret void +} + +attributes #0 = { "no-infs-fp-math"="true" "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/PowerPC/spe.ll b/llvm/test/CodeGen/PowerPC/spe.ll index 1c4c7a3398177..1e4cecb1d136d 100644 --- a/llvm/test/CodeGen/PowerPC/spe.ll +++ b/llvm/test/CodeGen/PowerPC/spe.ll @@ -1422,3 +1422,64 @@ for.body: ; preds = %for.body, %entry declare float @llvm.fma.f32(float, float, float) #1 attributes #1 = { nounwind readnone speculatable willreturn } + +%struct.a = type { float, float } + +define void @d(%struct.a* %e, %struct.a* %f) { +; CHECK-LABEL: d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 0, 4(1) +; CHECK-NEXT: stwu 1, -48(1) +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 4 +; CHECK-NEXT: .cfi_offset r29, -12 +; CHECK-NEXT: .cfi_offset r30, -8 +; CHECK-NEXT: .cfi_offset r29, -40 +; CHECK-NEXT: .cfi_offset r30, -32 +; CHECK-NEXT: lwz 4, 0(4) +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: stw 29, 36(1) # 4-byte Folded Spill +; CHECK-NEXT: evstdd 29, 8(1) # 8-byte Folded Spill +; CHECK-NEXT: efdcfs 29, 4 +; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill +; CHECK-NEXT: mr 4, 29 +; CHECK-NEXT: evstdd 30, 16(1) # 8-byte Folded Spill +; CHECK-NEXT: efdcfs 30, 3 +; CHECK-NEXT: evmergehi 3, 29, 29 +; CHECK-NEXT: mtctr 3 +; CHECK-NEXT: # kill: def $r3 killed $r3 killed $s3 +; CHECK-NEXT: bctrl +; CHECK-NEXT: evmergehi 3, 30, 30 +; CHECK-NEXT: mr 4, 30 +; CHECK-NEXT: mtctr 3 +; CHECK-NEXT: # kill: def $r3 killed $r3 killed $s3 +; CHECK-NEXT: bctrl +; CHECK-NEXT: li 3, 
.LCPI58_0@l +; CHECK-NEXT: lis 4, .LCPI58_0@ha +; CHECK-NEXT: evlddx 3, 4, 3 +; CHECK-NEXT: evldd 30, 16(1) # 8-byte Folded Reload +; CHECK-NEXT: efdmul 3, 29, 3 +; CHECK-NEXT: evldd 29, 8(1) # 8-byte Folded Reload +; CHECK-NEXT: efscfd 3, 3 +; CHECK-NEXT: stw 3, 0(3) +; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload +; CHECK-NEXT: lwz 29, 36(1) # 4-byte Folded Reload +; CHECK-NEXT: lwz 0, 52(1) +; CHECK-NEXT: addi 1, 1, 48 +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr +entry: + %0 = getelementptr %struct.a, %struct.a* %f, i32 0, i32 0 + %1 = load float, float* undef + %conv = fpext float %1 to double + %2 = load float, float* %0 + %g = fpext float %2 to double + %3 = call i32 undef(double %g) + %h = call i32 undef(double %conv) + %n = sitofp i32 %3 to double + %k = fmul double %g, %n + %l = fptrunc double %k to float + store float %l, float* undef + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/stwu-sched.ll b/llvm/test/CodeGen/PowerPC/stwu-sched.ll index 0afd2ee406894..36afaf84a296b 100644 --- a/llvm/test/CodeGen/PowerPC/stwu-sched.ll +++ b/llvm/test/CodeGen/PowerPC/stwu-sched.ll @@ -58,7 +58,7 @@ define void @initCombList(%0* nocapture, i32 signext) local_unnamed_addr #0 { ret void } -attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} diff --git a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll index 497add38e0444..79a368dd095ac 100644 --- a/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll +++ b/llvm/test/CodeGen/PowerPC/unal-vec-ldst.ll @@ -327,72 +327,6 @@ entry: } -define <4 x float> @test_l_qv4float(<4 x float>* %p) #1 { -; CHECK-LABEL: test_l_qv4float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 15 -; CHECK-NEXT: qvlpclsx 0, 0, 3 -; CHECK-NEXT: qvlfsx 1, 3, 4 -; CHECK-NEXT: qvlfsx 2, 0, 3 -; CHECK-NEXT: qvfperm 1, 2, 1, 0 -; CHECK-NEXT: blr -entry: - %r = load <4 x float>, <4 x float>* %p, align 4 - ret <4 x float> %r - -} - -define <8 x float> @test_l_qv8float(<8 x float>* %p) #1 { -; CHECK-LABEL: test_l_qv8float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 31 -; CHECK-NEXT: qvlpclsx 1, 0, 3 -; CHECK-NEXT: qvlfsx 0, 3, 4 -; CHECK-NEXT: li 4, 16 -; CHECK-NEXT: qvlfsx 3, 3, 4 -; CHECK-NEXT: qvlfsx 4, 0, 3 -; CHECK-NEXT: qvfperm 2, 3, 0, 1 -; CHECK-NEXT: qvfperm 1, 4, 3, 1 -; CHECK-NEXT: blr -entry: - %r = load <8 x float>, <8 x float>* %p, align 4 - ret <8 x float> %r - -} - -define <4 x double> @test_l_qv4double(<4 x double>* %p) #1 { -; CHECK-LABEL: 
test_l_qv4double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 31 -; CHECK-NEXT: qvlpcldx 0, 0, 3 -; CHECK-NEXT: qvlfdx 1, 3, 4 -; CHECK-NEXT: qvlfdx 2, 0, 3 -; CHECK-NEXT: qvfperm 1, 2, 1, 0 -; CHECK-NEXT: blr -entry: - %r = load <4 x double>, <4 x double>* %p, align 8 - ret <4 x double> %r - -} - -define <8 x double> @test_l_qv8double(<8 x double>* %p) #1 { -; CHECK-LABEL: test_l_qv8double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 4, 63 -; CHECK-NEXT: qvlpcldx 1, 0, 3 -; CHECK-NEXT: qvlfdx 0, 3, 4 -; CHECK-NEXT: li 4, 32 -; CHECK-NEXT: qvlfdx 3, 3, 4 -; CHECK-NEXT: qvlfdx 4, 0, 3 -; CHECK-NEXT: qvfperm 2, 3, 0, 1 -; CHECK-NEXT: qvfperm 1, 4, 3, 1 -; CHECK-NEXT: blr -entry: - %r = load <8 x double>, <8 x double>* %p, align 8 - ret <8 x double> %r - -} - define void @test_s_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 { ; CHECK-LABEL: test_s_v16i8: ; CHECK: # %bb.0: # %entry @@ -537,89 +471,6 @@ entry: } -define void @test_s_qv4float(<4 x float>* %p, <4 x float> %v) #1 { -; CHECK-LABEL: test_s_qv4float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfs 1, 0(3) -; CHECK-NEXT: stfs 0, 12(3) -; CHECK-NEXT: qvesplati 0, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfs 0, 8(3) -; CHECK-NEXT: stfs 1, 4(3) -; CHECK-NEXT: blr -entry: - store <4 x float> %v, <4 x float>* %p, align 4 - ret void - -} - -define void @test_s_qv8float(<8 x float>* %p, <8 x float> %v) #1 { -; CHECK-LABEL: test_s_qv8float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: stfs 2, 16(3) -; CHECK-NEXT: stfs 0, 28(3) -; CHECK-NEXT: qvesplati 0, 2, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: stfs 1, 0(3) -; CHECK-NEXT: stfs 0, 24(3) -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfs 2, 20(3) -; CHECK-NEXT: qvesplati 2, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfs 0, 12(3) -; CHECK-NEXT: stfs 2, 8(3) -; CHECK-NEXT: stfs 1, 4(3) -; CHECK-NEXT: blr -entry: - store <8 x float> %v, <8 x float>* %p, align 4 - ret void - -} - -define void @test_s_qv4double(<4 x double>* %p, <4 x double> %v) #1 { -; CHECK-LABEL: test_s_qv4double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfd 1, 0(3) -; CHECK-NEXT: stfd 0, 24(3) -; CHECK-NEXT: qvesplati 0, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfd 0, 16(3) -; CHECK-NEXT: stfd 1, 8(3) -; CHECK-NEXT: blr -entry: - store <4 x double> %v, <4 x double>* %p, align 8 - ret void - -} - -define void @test_s_qv8double(<8 x double>* %p, <8 x double> %v) #1 { -; CHECK-LABEL: test_s_qv8double: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: qvesplati 0, 2, 3 -; CHECK-NEXT: stfd 2, 32(3) -; CHECK-NEXT: stfd 0, 56(3) -; CHECK-NEXT: qvesplati 0, 2, 2 -; CHECK-NEXT: qvesplati 2, 2, 1 -; CHECK-NEXT: stfd 1, 0(3) -; CHECK-NEXT: stfd 0, 48(3) -; CHECK-NEXT: qvesplati 0, 1, 3 -; CHECK-NEXT: stfd 2, 40(3) -; CHECK-NEXT: qvesplati 2, 1, 2 -; CHECK-NEXT: qvesplati 1, 1, 1 -; CHECK-NEXT: stfd 0, 24(3) -; CHECK-NEXT: stfd 2, 16(3) -; CHECK-NEXT: stfd 1, 8(3) -; CHECK-NEXT: blr -entry: - store <8 x double> %v, <8 x double>* %p, align 8 - ret void - -} - attributes #0 = { nounwind "target-cpu"="pwr7" } -attributes #1 = { nounwind "target-cpu"="a2q" } attributes #2 = { nounwind "target-cpu"="pwr8" } diff --git a/llvm/test/CodeGen/PowerPC/uwtables.ll b/llvm/test/CodeGen/PowerPC/uwtables.ll index 7523d04d73d38..e302934ab8d6b 100644 --- a/llvm/test/CodeGen/PowerPC/uwtables.ll +++ b/llvm/test/CodeGen/PowerPC/uwtables.ll @@ -47,5 +47,5 @@ declare i32 @__gxx_personality_v0(...) 
declare void @__cxa_call_unexpected(i8*) local_unnamed_addr -attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index eff56a041ee27..d1151f29a9b20 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -23,14 +23,14 @@ define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: and r4, r12, #15 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: vdup.32 q3, r4 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 -; CHECK-NEXT: vcmp.i32 eq, q3, zr +; CHECK-NEXT: vdup.32 q3, r4 +; CHECK-NEXT: vpt.i32 eq, q3, zr +; CHECK-NEXT: vmovt q1, q2 ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index e69610f9df2a9..261222f60f17a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1798,20 +1798,20 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh.w r4, [r3, #2] ; CHECK-NEXT: vldr.16 s2, [r2, #2] -; CHECK-NEXT: ldrsh r5, [r3, #-2] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmov s4, r4 -; CHECK-NEXT: ldrsh.w r4, [r3] +; CHECK-NEXT: ldrsh r4, [r3], #8 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 -; CHECK-NEXT: vmov s8, r5 +; CHECK-NEXT: ldrsh r5, [r3, #-10] ; CHECK-NEXT: vmul.f16 s2, s2, s4 -; CHECK-NEXT: vldr.16 s4, [r2] ; CHECK-NEXT: vmov s6, r4 -; CHECK-NEXT: ldrsh r4, [r3, #-4] +; CHECK-NEXT: vldr.16 s4, [r2] ; CHECK-NEXT: vcvt.f16.s32 s6, s6 -; CHECK-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-NEXT: ldrsh r4, [r3, #-12] ; CHECK-NEXT: vmul.f16 s4, s4, s6 +; CHECK-NEXT: vmov s8, r5 ; CHECK-NEXT: vldr.16 s6, [r2, #-2] +; CHECK-NEXT: vcvt.f16.s32 s8, s8 ; CHECK-NEXT: vmov s10, r4 ; CHECK-NEXT: vcvtb.f32.f16 s4, s4 ; CHECK-NEXT: vmul.f16 s6, s6, s8 @@ -1821,9 +1821,8 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; 
CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 -; CHECK-NEXT: adds r3, #8 -; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: adds r2, #8 +; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 428c703dd341e..69039f9a4eaa2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -437,17 +437,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] -; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: ldrb r7, [r6], #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] -; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: ldrb r7, [r6, #-3] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] -; CHECK-NEXT: ldrb.w r8, [r5] -; CHECK-NEXT: adds r5, #4 -; CHECK-NEXT: ldrb r7, [r6, #2] -; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: ldrb r8, [r5], #4 +; CHECK-NEXT: ldrb r7, [r6, #-2] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 @@ -740,17 +738,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-8] ; CHECK-NEXT: ldrb r8, [r5, #-2] -; CHECK-NEXT: ldrb r7, [r6] +; CHECK-NEXT: ldrb r7, [r6], #4 ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #-4] ; CHECK-NEXT: ldrb r8, [r5, #-1] -; CHECK-NEXT: ldrb r7, [r6, #1] +; CHECK-NEXT: ldrb r7, [r6, #-3] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4] -; CHECK-NEXT: ldrb.w r8, [r5] -; CHECK-NEXT: adds r5, #4 -; CHECK-NEXT: ldrb r7, [r6, #2] -; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: ldrb r8, [r5], #4 +; CHECK-NEXT: ldrb r7, [r6, #-2] ; CHECK-NEXT: smlabb r7, r7, r8, r2 ; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 diff --git a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll index 5c8f934ce61d3..a106900dc3e99 100644 --- a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll +++ b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll @@ -37,26 +37,25 @@ define i32 @test_values(i32 %a, i32 %b) minsize optsize { ; CHECK-V7M: mov r2, r0 ; CHECK-V7M-NEXT: ldr r0, .LCPI0_0 ; CHECK-V7M-NEXT: cmp r2, #50 -; CHECK-V7M-NEXT: beq .LBB0_5 +; CHECK-V7M-NEXT: beq .LBB0_3 ; CHECK-V7M-NEXT: cmp r2, #1 -; CHECK-V7M-NEXT: beq .LBB0_7 +; CHECK-V7M-NEXT: ittt eq +; CHECK-V7M-NEXT: addeq r0, r1 +; CHECK-V7M-NEXT: addeq r0, #1 +; CHECK-V7M-NEXT: bxeq lr ; CHECK-V7M-NEXT: cmp r2, #30 -; CHECK-V7M-NEXT: beq .LBB0_8 -; CHECK-V7M-NEXT: cbnz r2, .LBB0_6 +; CHECK-V7M-NEXT: ittt eq +; CHECK-V7M-NEXT: addeq r0, r1 +; CHECK-V7M-NEXT: addeq r0, #2 +; CHECK-V7M-NEXT: bxeq lr +; CHECK-V7M-NEXT: cbnz r2, .LBB0_4 +; CHECK-V7M-NEXT: .LBB0_2: ; CHECK-V7M-NEXT: add r0, r1 ; CHECK-V7M-NEXT: bx lr -; CHECK-V7M-NEXT: .LBB0_5: +; CHECK-V7M-NEXT: .LBB0_3: ; CHECK-V7M-NEXT: add r0, r1 ; CHECK-V7M-NEXT: adds r0, #4 -; CHECK-V7M-NEXT: .LBB0_6: -; CHECK-V7M-NEXT: bx lr -; CHECK-V7M-NEXT: .LBB0_7: -; CHECK-V7M-NEXT: add r0, r1 -; CHECK-V7M-NEXT: adds r0, #1 -; CHECK-V7M-NEXT: bx lr -; CHECK-V7M-NEXT: .LBB0_8: -; CHECK-V7M-NEXT: add r0, r1 -; CHECK-V7M-NEXT: adds r0, #2 +; 
CHECK-V7M-NEXT: .LBB0_4: ; CHECK-V7M-NEXT: bx lr ; CHECK-V7M-NEXT: .p2align 2 ; CHECK-V7M-NEXT: .LCPI0_0: diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-diamonds.ll b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-diamonds.ll index b7ba741b24983..116084b276fc8 100644 --- a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-diamonds.ll +++ b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-diamonds.ll @@ -22,7 +22,8 @@ while.cond38: ; preds = %if.end111, %entry ; CHECK-NEXT: it eq ; CHECK-NEXT: ldreq ; CHECK-NEXT: it ne -; CHECK-NEXT: movsne + ; N.b. 16-bit mov instruction in IT block does not set flags. +; CHECK-NEXT: movne ; CHECK-NEXT: mvns ; CHECK-NEXT: b cond.true77: ; preds = %while.cond38 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index e8ab7792b6dfc..1a53561388138 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1147,31 +1147,30 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r0, [r6] +; CHECK-NEXT: ldrh r0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: adds r1, r5, #2 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: ldrh r0, [r6, #2] +; CHECK-NEXT: ldrh r0, [r6, #-14] ; CHECK-NEXT: adds r1, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #4] +; CHECK-NEXT: ldrh r0, [r6, #-12] ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: ldrh r0, [r6, #6] +; CHECK-NEXT: ldrh r0, [r6, #-10] ; CHECK-NEXT: add.w r1, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #8] +; CHECK-NEXT: ldrh r0, [r6, #-8] ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: ldrh r0, [r6, #10] -; CHECK-NEXT: ldrh r1, [r6, #14] +; CHECK-NEXT: ldrh r0, [r6, #-6] +; CHECK-NEXT: ldrh r1, [r6, #-2] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #12] +; CHECK-NEXT: ldrh r0, [r6, #-4] ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: adds r6, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vldrw.u32 q1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll index 30ce13b850736..63990baf7fc84 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -266,6 +266,430 @@ entry: ret <8 x i16> %gather } +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vmov.i32 q1, #0x28 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrh.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrh 
r5, [r5] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep2(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save 
{r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: adr.w r12, .LCPI17_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 131078 @ 0x20006 +; CHECK-NEXT: .long 131084 @ 0x2000c +; CHECK-NEXT: .long 131090 @ 0x20012 +; CHECK-NEXT: .LCPI17_1: +; CHECK-NEXT: .long 131096 @ 0x20018 +; CHECK-NEXT: .long 131102 @ 0x2001e +; CHECK-NEXT: .long 131108 @ 0x20024 +; CHECK-NEXT: .long 131114 @ 0x2002a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI18_0 +; CHECK-NEXT: adr.w r12, .LCPI18_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI18_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 42 @ 0x2a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) { +; CHECK-LABEL: 
scaled_v8i16_i16_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov.i32 q2, #0x20000 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI20_0 +; CHECK-NEXT: adr.w r12, .LCPI20_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI20_0: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .LCPI20_1: +; CHECK-NEXT: .long 131074 @ 0x20002 +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 38 @ 0x26 +; CHECK-NEXT: .long 44 @ 0x2c +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: adr.w r12, .LCPI21_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: 
vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 128 @ 0x80 +; CHECK-NEXT: .long 1206 @ 0x4b6 +; CHECK-NEXT: .long 1212 @ 0x4bc +; CHECK-NEXT: .long 1218 @ 0x4c2 +; CHECK-NEXT: .LCPI21_1: +; CHECK-NEXT: .long 1224 @ 0x4c8 +; CHECK-NEXT: .long 1230 @ 0x4ce +; CHECK-NEXT: .long 1236 @ 0x4d4 +; CHECK-NEXT: .long 1242 @ 0x4da +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(i32* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_basei32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrh.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %offs.zext + %ptrs.cast = bitcast <8 x i32*> %ptrs to <8 x i16*> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs.cast, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll index c674ffbf51bd3..4c32200ee4a53 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -294,6 +294,45 @@ entry: ret <4 x i32> %gather.sext } +define arm_aapcs_vfpcc <4 x i32> 
@scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) { +; CHECK-LABEL: scaled_i32_i32_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmov.i32 q0, #0x14 +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs + %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + +define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base, <4 x i32>* %offptr) { +; CHECK-LABEL: scaled_i32_i32_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +entry: + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> + %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll index 60bffc5a31dbd..c7d29af67b5b1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -365,6 +365,812 @@ entry: ret <16 x i8> %gather } +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov.i32 q2, #0x5 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r3, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q0, q2 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vadd.i32 q3, q0, q2 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], 
r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI10_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; 
CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: adr r2, .LCPI11_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI11_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI11_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 280 @ 0x118 +; CHECK-NEXT: .long 283 @ 0x11b +; CHECK-NEXT: .long 286 @ 0x11e +; CHECK-NEXT: .long 289 @ 0x121 +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 292 @ 0x124 +; CHECK-NEXT: .long 295 @ 0x127 +; CHECK-NEXT: .long 298 @ 0x12a +; CHECK-NEXT: .long 301 @ 0x12d +; CHECK-NEXT: .LCPI11_2: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 259 @ 0x103 +; CHECK-NEXT: .long 262 @ 0x106 +; CHECK-NEXT: .long 265 @ 0x109 +; CHECK-NEXT: .LCPI11_3: +; CHECK-NEXT: .long 268 @ 0x10c +; CHECK-NEXT: .long 271 @ 0x10f +; CHECK-NEXT: .long 274 @ 0x112 +; CHECK-NEXT: .long 277 @ 0x115 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 
x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI12_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 39 @ 0x27 +; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI12_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .long 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x100 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q3, q3, 
q4 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: ldrb.w r12, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r3, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vadd.i32 q3, q0, q4 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[3], r5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r5 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[6], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], lr +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.8 q0[15], r1 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: adr r2, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI14_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; 
CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 257 @ 0x101 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 31 @ 0x1f +; CHECK-NEXT: .long 34 @ 0x22 +; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .long 37 @ 0x25 +; CHECK-NEXT: .long 40 @ 0x28 +; CHECK-NEXT: .long 43 @ 0x2b +; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .LCPI14_2: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI14_3: +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 19 @ 0x13 +; CHECK-NEXT: .long 22 @ 0x16 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 1 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: adr r2, .LCPI15_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI15_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI15_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .long 224 @ 0xe0 +; CHECK-NEXT: .long 227 @ 0xe3 +; CHECK-NEXT: .long 230 @ 0xe6 +; CHECK-NEXT: .long 233 @ 0xe9 +; CHECK-NEXT: 
.LCPI15_1: +; CHECK-NEXT: .long 236 @ 0xec +; CHECK-NEXT: .long 239 @ 0xef +; CHECK-NEXT: .long 242 @ 0xf2 +; CHECK-NEXT: .long 245 @ 0xf5 +; CHECK-NEXT: .LCPI15_2: +; CHECK-NEXT: .long 300 @ 0x12c +; CHECK-NEXT: .long 203 @ 0xcb +; CHECK-NEXT: .long 206 @ 0xce +; CHECK-NEXT: .long 209 @ 0xd1 +; CHECK-NEXT: .LCPI15_3: +; CHECK-NEXT: .long 212 @ 0xd4 +; CHECK-NEXT: .long 215 @ 0xd7 +; CHECK-NEXT: .long 218 @ 0xda +; CHECK-NEXT: .long 221 @ 0xdd +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 200 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: 
.byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_basei16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <16 x i32> %offs.zext + %ptrs.cast = bitcast <16 x i16*> %ptrs to <16 x i8*> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs.cast, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll index 22f6cd6bc3d05..e594fee53f9dc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll @@ -1,7 
+1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s +; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -S -o 2>/dev/null - | FileCheck %s define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: @push_out_add_sub_block( diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll new file mode 100644 index 0000000000000..7e01d24d006aa --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s + +define void @ptr_iv_v4i32(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) { +; CHECK-LABEL: ptr_iv_v4i32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 12 @ 0xc +vector.ph: + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i32, i32* %pointer.phi, i32 16 + %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> + %2 = getelementptr i32, i32* %pointer.phi13, i32 16 + %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> , <4 x i32> undef) + %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %3, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) { +; CHECK-LABEL: ptr_iv_v4i32_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r1, .LCPI1_0 +; CHECK-NEXT: adr r3, .LCPI1_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: 
Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf +vector.ph: + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i32, i32* %pointer.phi, i32 16 + %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> + %gather.address = getelementptr i32, <4 x i32*> %1, i32 3 + %2 = getelementptr i32, i32* %pointer.phi13, i32 16 + %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> + %scatter.address = getelementptr i32, <4 x i32*> %1, i32 5 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gather.address, i32 4, <4 x i1> , <4 x i32> undef) + %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %scatter.address, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; CHECK-LABEL: ptr_iv_v8i16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %wide.masked.gather = call <8 x i16> 
@llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %3, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; CHECK-LABEL: ptr_iv_v8i16_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI3_0 +; CHECK-NEXT: adr r3, .LCPI3_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q2, q2, r2 +; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI3_0: +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 +; CHECK-NEXT: .LCPI3_1: +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %gather.address = getelementptr i16, <8 x i16*> %1, i16 3 + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %scatter.address = getelementptr i16, <8 x i16*> %3, i16 5 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gather.address, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %scatter.address, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI4_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q1, q1, r2 +; CHECK-NEXT: vstrb.8 q1, [r1, q0] 
+; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 48 @ 0x30 +; CHECK-NEXT: .byte 52 @ 0x34 +; CHECK-NEXT: .byte 56 @ 0x38 +; CHECK-NEXT: .byte 60 @ 0x3c +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %3, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI5_0 +; CHECK-NEXT: adr r3, .LCPI5_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q2, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q2, q2, r2 +; CHECK-NEXT: vstrb.8 q2, [r1, q1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI5_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .byte 49 @ 0x31 +; CHECK-NEXT: .byte 53 @ 0x35 +; CHECK-NEXT: .byte 57 @ 0x39 +; CHECK-NEXT: .byte 61 @ 0x3d +; CHECK-NEXT: .byte 65 @ 0x41 +; CHECK-NEXT: .LCPI5_1: +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 51 @ 0x33 +; CHECK-NEXT: .byte 55 @ 0x37 +; 
CHECK-NEXT: .byte 59 @ 0x3b +; CHECK-NEXT: .byte 63 @ 0x3f +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %gather.address = getelementptr i8, <16 x i8*> %1, i8 3 + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %scatter.address = getelementptr i8, <16 x i8*> %3, i8 5 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gather.address, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %scatter.address, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI6_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.f32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 12 @ 0xc +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr float, float* %pointer.phi, i32 16 + %1 = getelementptr float, float* %pointer.phi, <4 x i32> + %2 = getelementptr float, float* %pointer.phi13, i32 16 + %3 = getelementptr float, float* %pointer.phi13, <4 x i32> + %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %1, i32 4, <4 x i1> , <4 x float> undef) + %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %3, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, 
+; CHECK-NEXT: adr r1, .LCPI7_0
+; CHECK-NEXT: adr r3, .LCPI7_1
+; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: mov.w lr, #249
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: .LBB7_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT: vadd.f32 q2, q2, r2
+; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: le lr, .LBB7_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI7_0:
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 9 @ 0x9
+; CHECK-NEXT: .long 13 @ 0xd
+; CHECK-NEXT: .long 17 @ 0x11
+; CHECK-NEXT: .LCPI7_1:
+; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .long 15 @ 0xf
+vector.ph:
+ %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0
+ %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ]
+ %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr float, float* %pointer.phi, i32 16
+ %1 = getelementptr float, float* %pointer.phi, <4 x i32>
+ %gather.address = getelementptr float, <4 x float*> %1, i32 3
+ %2 = getelementptr float, float* %pointer.phi13, i32 16
+ %3 = getelementptr float, float* %pointer.phi13, <4 x i32>
+ %scatter.address = getelementptr float, <4 x float*> %1, i32 5
+ %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gather.address, i32 4, <4 x i1> , <4 x float> undef)
+ %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat
+ call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %scatter.address, i32 4, <4 x i1> )
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, 996
+ br i1 %5, label %end, label %vector.body
+
+end:
+ ret void
+}
+
+define void @ptr_iv_v8f16(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) {
+; CHECK-LABEL: ptr_iv_v8f16:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov s0, r2
+; CHECK-NEXT: adr r3, .LCPI8_0
+; CHECK-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-NEXT: mov.w lr, #249
+; CHECK-NEXT: vmov.f16 r2, s0
+; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: .LBB8_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: vadd.f16 q1, q1, r2
+; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1]
+; CHECK-NEXT: adds r1, #64
+; CHECK-NEXT: le lr, .LBB8_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI8_0:
+; CHECK-NEXT: .short 0 @ 0x0
+; CHECK-NEXT: .short 4 @ 0x4
+; CHECK-NEXT: .short 8 @ 0x8
+; CHECK-NEXT: .short 12 @ 0xc
+; CHECK-NEXT: .short 16 @ 0x10
+; CHECK-NEXT: .short 20 @ 0x14
+; CHECK-NEXT: .short 24 @ 0x18
+; CHECK-NEXT: .short 28 @ 0x1c
+vector.ph:
+ %y.trunc = fptrunc float %y to half
+ %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
+ %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ]
+ %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr half, half* %pointer.phi, i32 32
+ %1 = getelementptr half, half* %pointer.phi, <8 x i16>
+ %2 = getelementptr half, half* %pointer.phi13, i32 32
+ %3 = getelementptr half, half* %pointer.phi13, <8 x i16>
+ %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %1, i32 4, <8 x i1> , <8 x half> undef)
+ %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %3, i32 4, <8 x i1> )
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, 996
+ br i1 %5, label %end, label %vector.body
+
+end:
+ ret void
+}
+
+define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) {
+; CHECK-LABEL: ptr_iv_v8f16_mult:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov s0, r2
+; CHECK-NEXT: adr r2, .LCPI9_0
+; CHECK-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-NEXT: mov.w lr, #249
+; CHECK-NEXT: vmov.f16 r1, s0
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adr r2, .LCPI9_1
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: .LBB9_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1]
+; CHECK-NEXT: vadd.f16 q2, q2, r1
+; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1]
+; CHECK-NEXT: adds r0, #64
+; CHECK-NEXT: le lr, .LBB9_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI9_0:
+; CHECK-NEXT: .short 3 @ 0x3
+; CHECK-NEXT: .short 7 @ 0x7
+; CHECK-NEXT: .short 11 @ 0xb
+; CHECK-NEXT: .short 15 @ 0xf
+; CHECK-NEXT: .short 19 @ 0x13
+; CHECK-NEXT: .short 23 @ 0x17
+; CHECK-NEXT: .short 27 @ 0x1b
+; CHECK-NEXT: .short 31 @ 0x1f
+; CHECK-NEXT: .LCPI9_1:
+; CHECK-NEXT: .short 5 @ 0x5
+; CHECK-NEXT: .short 9 @ 0x9
+; CHECK-NEXT: .short 13 @ 0xd
+; CHECK-NEXT: .short 17 @ 0x11
+; CHECK-NEXT: .short 21 @ 0x15
+; CHECK-NEXT: .short 25 @ 0x19
+; CHECK-NEXT: .short 29 @ 0x1d
+; CHECK-NEXT: .short 33 @ 0x21
+vector.ph:
+ %y.trunc = fptrunc float %y to half
+ %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
+ %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ]
+ %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr half, half* %pointer.phi, i32 32
+ %1 = getelementptr half, half* %pointer.phi, <8 x i16>
+ %gather.address = getelementptr half, <8 x half*> %1, i32 3
+ %2 = getelementptr half, half* %pointer.phi13, i32 32
+ %3 = getelementptr half, half* %pointer.phi13, <8 x i16>
+ %scatter.address = getelementptr half, <8 x half*> %1, i32 5
+ %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gather.address, i32 4, <8 x i1> , <8 x half> undef)
+ %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat
+ call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %scatter.address, i32 4, <8 x i1> )
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, 996
+ br i1 %5, label %end, label %vector.body
+
+end:
+ ret void
+}
+
+
+define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, i32* nocapture %z, i32 %n) {
+; CHECK-LABEL: three_pointer_iv_v4i32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: adr.w r12, .LCPI10_0
+; CHECK-NEXT: adr.w lr, .LCPI10_1
+; CHECK-NEXT: adr r3, .LCPI10_2
+; CHECK-NEXT: vldrw.u32 q2, [lr]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r12]
+; CHECK-NEXT: vmov.i32 q0, #0xa
+; CHECK-NEXT: .LBB10_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmul.i32 q4, q5, q4
+; CHECK-NEXT: add.w r0, r0, #48
+; CHECK-NEXT: vmul.i32 q6, q5, q6
+; CHECK-NEXT: vmul.i32 q5, q5, q0
+; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2]
+; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2]
+; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2]
+; CHECK-NEXT: add.w r1, r1, #48
+; CHECK-NEXT: bne .LBB10_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI10_0:
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .long 10 @ 0xa
+; CHECK-NEXT: .LCPI10_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 9 @ 0x9
+; CHECK-NEXT: .LCPI10_2:
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 11 @ 0xb
+vector.ph:
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi i32* [ %x, %vector.ph ], [ %v3, %vector.body ]
+ %pointer.phi55 = phi i32* [ %z, %vector.ph ], [ %v4, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i32>
+ %v3 = getelementptr i32, i32* %pointer.phi, i32 12
+ %vector.gep56 = getelementptr i32, i32* %pointer.phi55, <4 x i32>
+ %v4 = getelementptr i32, i32* %pointer.phi55, i32 12
+ %v5 = add i32 %index, 0
+ %v6 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 1
+ %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> , <4 x i32> undef)
+ %v7 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 2
+ %wide.masked.gather57 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v6, i32 4, <4 x i1> , <4 x i32> undef)
+ %wide.masked.gather58 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v7, i32 4, <4 x i1> , <4 x i32> undef)
+ %v11 = mul nuw nsw <4 x i32> %wide.masked.gather,
+ %v13 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather57
+ %v15 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather58
+ %v17 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 1
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v11, <4 x i32*> %vector.gep56, i32 4, <4 x i1> )
+ %v18 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 2
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v13, <4 x i32*> %v17, i32 4, <4 x i1> )
+ call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v15, <4 x i32*> %v18, i32 4, <4 x i1> )
+ %index.next = add i32 %index, 4
+ %v37 = icmp eq i32 %index.next, %n
+ br i1 %v37, label %end, label %vector.body
+
+end:
+ ret void;
+}
+
+define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) {
+; CHECK-LABEL: three_pointer_iv_v4i8:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: adr.w r12, .LCPI11_0
+; CHECK-NEXT: adr.w lr, .LCPI11_1
+; CHECK-NEXT: adr r3, .LCPI11_2
+; CHECK-NEXT: vldrw.u32 q2, [lr]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r12]
+; CHECK-NEXT: vmov.i32 q0, #0xa
+; CHECK-NEXT: .LBB11_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrb.u32 q4, [r0, q1]
+; CHECK-NEXT: vldrb.u32 q5, [r0, q2]
+; CHECK-NEXT: vldrb.u32 q6, [r0, q3]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmul.i32 q4, q5, q4
+; CHECK-NEXT: add.w r0, r0, #12
+; CHECK-NEXT: vmul.i32 q6, q5, q6
+; CHECK-NEXT: vmul.i32 q5, q5, q0
+; CHECK-NEXT: vstrb.32 q5, [r1, q2]
+; CHECK-NEXT: vstrb.32 q6, [r1, q3]
+; CHECK-NEXT: vstrb.32 q4, [r1, q1]
+; CHECK-NEXT: add.w r1, r1, #12
+; CHECK-NEXT: bne .LBB11_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI11_0:
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .long 10 @ 0xa
+; CHECK-NEXT: .LCPI11_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 9 @ 0x9
+; CHECK-NEXT: .LCPI11_2:
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 11 @ 0xb
+vector.ph:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ]
+ %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vector.gep = getelementptr i8, i8* %pointer.phi, <4 x i32>
+ %v3 = getelementptr i8, i8* %pointer.phi, i32 12
+ %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <4 x i32>
+ %v4 = getelementptr i8, i8* %pointer.phi55, i32 12
+ %v5 = add i32 %index, 0
+ %v6 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 1
+ %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %vector.gep, i32 1, <4 x i1> , <4 x i8> undef)
+ %v7 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 2
+ %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v6, i32 1, <4 x i1> , <4 x i8> undef)
+ %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v7, i32 1, <4 x i1> , <4 x i8> undef)
+ %v8 = zext <4 x i8> %wide.masked.gather to <4 x i32>
+ %v9 = zext <4 x i8> %wide.masked.gather57 to <4 x i32>
+ %v10 = zext <4 x i8> %wide.masked.gather58 to <4 x i32>
+ %v11 = mul nuw nsw <4 x i32> %v8,
+ %v12 = trunc <4 x i32> %v11 to <4 x i8>
+ %v13 = mul nuw nsw <4 x i32> %v8, %v9
+ %v14 = trunc <4 x i32> %v13 to <4 x i8>
+ %v15 = mul nuw nsw <4 x i32> %v8, %v10
+ %v16 = trunc <4 x i32> %v15 to <4 x i8>
+ %v17 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 1
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v12, <4 x i8*> %vector.gep56, i32 1, <4 x i1> )
+ %v18 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 2
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v14, <4 x i8*> %v17, i32 1, <4 x i1> )
+ call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v16, <4 x i8*> %v18, i32 1, <4 x i1> )
+ %index.next = add i32 %index, 4
+ %v37 = icmp eq i32 %index.next, %n
+ br i1 %v37, label %end, label %vector.body
+
+end:
+ ret void;
+}
+
+define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) {
+; CHECK-LABEL: three_pointer_iv_v8i16:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: adr.w r12, .LCPI12_0
+; CHECK-NEXT: adr.w lr, .LCPI12_1
+; CHECK-NEXT: adr r3, .LCPI12_2
+; CHECK-NEXT: vldrw.u32 q2, [lr]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r12]
+; CHECK-NEXT: vmov.i16 q0, #0xa
+; CHECK-NEXT: .LBB12_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1]
+; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1]
+; CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmul.i16 q4, q5, q4
+; CHECK-NEXT: add.w r0, r0, #48
+; CHECK-NEXT: vmul.i16 q6, q5, q6
+; CHECK-NEXT: vmul.i16 q5, q5, q0
+; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1]
+; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1]
+; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1]
+; CHECK-NEXT: add.w r1, r1, #48
+; CHECK-NEXT: bne .LBB12_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI12_0:
+; CHECK-NEXT: .short 1 @ 0x1
+; CHECK-NEXT: .short 4 @ 0x4
+; CHECK-NEXT: .short 7 @ 0x7
+; CHECK-NEXT: .short 10 @ 0xa
+; CHECK-NEXT: .short 13 @ 0xd
+; CHECK-NEXT: .short 16 @ 0x10
+; CHECK-NEXT: .short 19 @ 0x13
+; CHECK-NEXT: .short 22 @ 0x16
+; CHECK-NEXT: .LCPI12_1:
+; CHECK-NEXT: .short 0 @ 0x0
+; CHECK-NEXT: .short 3 @ 0x3
+; CHECK-NEXT: .short 6 @ 0x6
+; CHECK-NEXT: .short 9 @ 0x9
+; CHECK-NEXT: .short 12 @ 0xc
+; CHECK-NEXT: .short 15 @ 0xf
+; CHECK-NEXT: .short 18 @ 0x12
+; CHECK-NEXT: .short 21 @ 0x15
+; CHECK-NEXT: .LCPI12_2:
+; CHECK-NEXT: .short 2 @ 0x2
+; CHECK-NEXT: .short 5 @ 0x5
+; CHECK-NEXT: .short 8 @ 0x8
+; CHECK-NEXT: .short 11 @ 0xb
+; CHECK-NEXT: .short 14 @ 0xe
+; CHECK-NEXT: .short 17 @ 0x11
+; CHECK-NEXT: .short 20 @ 0x14
+; CHECK-NEXT: .short 23 @ 0x17
+vector.ph:
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi i16* [ %x, %vector.ph ], [ %v3, %vector.body ]
+ %pointer.phi55 = phi i16* [ %z, %vector.ph ], [ %v4, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vector.gep = getelementptr i16, i16* %pointer.phi, <8 x i16>
+ %v3 = getelementptr i16, i16* %pointer.phi, i32 24
+ %vector.gep56 = getelementptr i16, i16* %pointer.phi55, <8 x i16>
+ %v4 = getelementptr i16, i16* %pointer.phi55, i32 24
+ %v5 = add i32 %index, 0
+ %v6 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 1
+ %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %vector.gep, i32 2, <8 x i1> , <8 x i16> undef)
+ %v7 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 2
+ %wide.masked.gather57 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v6, i32 2, <8 x i1> , <8 x i16> undef)
+ %wide.masked.gather58 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v7, i32 2, <8 x i1> , <8 x i16> undef)
+ %v11 = mul nuw nsw <8 x i16> %wide.masked.gather,
+ %v13 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather57
+ %v15 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather58
+ %v17 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 1
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v11, <8 x i16*> %vector.gep56, i32 2, <8 x i1> )
+ %v18 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 2
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v13, <8 x i16*> %v17, i32 2, <8 x i1> )
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v15, <8 x i16*> %v18, i32 2, <8 x i1> )
+ %index.next = add i32 %index, 4
+ %v37 = icmp eq i32 %index.next, %n
+ br i1 %v37, label %end, label %vector.body
+
+end:
+ ret void;
+}
+
+define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) {
+; CHECK-LABEL: three_pointer_iv_v16i8:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: adr.w r12, .LCPI13_0
+; CHECK-NEXT: adr.w lr, .LCPI13_1
+; CHECK-NEXT: adr r3, .LCPI13_2
+; CHECK-NEXT: vldrw.u32 q2, [lr]
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r12]
+; CHECK-NEXT: vmov.i8 q0, #0xa
+; CHECK-NEXT: .LBB13_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrb.u8 q4, [r0, q1]
+; CHECK-NEXT: vldrb.u8 q5, [r0, q2]
+; CHECK-NEXT: vldrb.u8 q6, [r0, q3]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmul.i8 q4, q5, q4
+; CHECK-NEXT: add.w r0, r0, #48
+; CHECK-NEXT: vmul.i8 q6, q5, q6
+; CHECK-NEXT: vmul.i8 q5, q5, q0
+; CHECK-NEXT: vstrb.8 q5, [r1, q2]
+; CHECK-NEXT: vstrb.8 q6, [r1, q3]
+; CHECK-NEXT: vstrb.8 q4, [r1, q1]
+; CHECK-NEXT: add.w r1, r1, #48
+; CHECK-NEXT: bne .LBB13_1
+; CHECK-NEXT: @ %bb.2: @ %end
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI13_0:
+; CHECK-NEXT: .byte 1 @ 0x1
+; CHECK-NEXT: .byte 4 @ 0x4
+; CHECK-NEXT: .byte 7 @ 0x7
+; CHECK-NEXT: .byte 10 @ 0xa
+; CHECK-NEXT: .byte 13 @ 0xd
+; CHECK-NEXT: .byte 16 @ 0x10
+; CHECK-NEXT: .byte 19 @ 0x13
+; CHECK-NEXT: .byte 22 @ 0x16
+; CHECK-NEXT: .byte 25 @ 0x19
+; CHECK-NEXT: .byte 28 @ 0x1c
+; CHECK-NEXT: .byte 31 @ 0x1f
+; CHECK-NEXT: .byte 34 @ 0x22
+; CHECK-NEXT: .byte 37 @ 0x25
+; CHECK-NEXT: .byte 40 @ 0x28
+; CHECK-NEXT: .byte 43 @ 0x2b
+; CHECK-NEXT: .byte 46 @ 0x2e
+; CHECK-NEXT: .LCPI13_1:
+; CHECK-NEXT: .byte 0 @ 0x0
+; CHECK-NEXT: .byte 3 @ 0x3
+; CHECK-NEXT: .byte 6 @ 0x6
+; CHECK-NEXT: .byte 9 @ 0x9
+; CHECK-NEXT: .byte 12 @ 0xc
+; CHECK-NEXT: .byte 15 @ 0xf
+; CHECK-NEXT: .byte 18 @ 0x12
+; CHECK-NEXT: .byte 21 @ 0x15
+; CHECK-NEXT: .byte 24 @ 0x18
+; CHECK-NEXT: .byte 27 @ 0x1b
+; CHECK-NEXT: .byte 30 @ 0x1e
+; CHECK-NEXT: .byte 33 @ 0x21
+; CHECK-NEXT: .byte 36 @ 0x24
+; CHECK-NEXT: .byte 39 @ 0x27
+; CHECK-NEXT: .byte 42 @ 0x2a
+; CHECK-NEXT: .byte 45 @ 0x2d
+; CHECK-NEXT: .LCPI13_2:
+; CHECK-NEXT: .byte 2 @ 0x2
+; CHECK-NEXT: .byte 5 @ 0x5
+; CHECK-NEXT: .byte 8 @ 0x8
+; CHECK-NEXT: .byte 11 @ 0xb
+; CHECK-NEXT: .byte 14 @ 0xe
+; CHECK-NEXT: .byte 17 @ 0x11
+; CHECK-NEXT: .byte 20 @ 0x14
+; CHECK-NEXT: .byte 23 @ 0x17
+; CHECK-NEXT: .byte 26 @ 0x1a
+; CHECK-NEXT: .byte 29 @ 0x1d
+; CHECK-NEXT: .byte 32 @ 0x20
+; CHECK-NEXT: .byte 35 @ 0x23
+; CHECK-NEXT: .byte 38 @ 0x26
+; CHECK-NEXT: .byte 41 @ 0x29
+; CHECK-NEXT: .byte 44 @ 0x2c
+; CHECK-NEXT: .byte 47 @ 0x2f
+vector.ph:
+ br label %vector.body
+
+vector.body:
+ %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ]
+ %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ]
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vector.gep = getelementptr i8, i8* %pointer.phi, <16 x i8>
+ %v3 = getelementptr i8, i8* %pointer.phi, i32 48
+ %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <16 x i8>
+ %v4 = getelementptr i8, i8* %pointer.phi55, i32 48
+ %v5 = add i32 %index, 0
+ %v6 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 1
+ %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %vector.gep, i32 1, <16 x i1> , <16 x i8> undef)
+ %v7 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 2
+ %wide.masked.gather57 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v6, i32 1, <16 x i1> , <16 x i8> undef)
+ %wide.masked.gather58 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v7, i32 1, <16 x i1> , <16 x i8> undef)
+ %v11 = mul nuw nsw <16 x i8> %wide.masked.gather,
+ %v13 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather57
+ %v15 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather58
+ %v17 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 1
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v11, <16 x i8*> %vector.gep56, i32 1, <16 x i1> )
+ %v18 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 2
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v13, <16 x i8*> %v17, i32 1, <16 x i1> )
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v15, <16 x i8*> %v18, i32 1, <16 x i1> )
+ %index.next = add i32 %index, 4
+ %v37 = icmp eq i32 %index.next, %n
+ br i1 %v37, label %end, label %vector.body
+
+end:
+ ret void;
+}
+
+declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
+declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
+
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
index fe9e7d197ca26..2db5bf59ecfae 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll
@@ -106,14 +106,12 @@ define void @arm_cmplx_dot_prod_q15(i16* nocapture readonly %pSrcA, i16* nocaptu
 ; CHECK-NEXT: wls lr, lr, .LBB1_7
 ; CHECK-NEXT: .LBB1_5: @ %while.body11
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrsh.w r5, [r0, #2]
-; CHECK-NEXT: ldrsh.w r6, [r1]
-; CHECK-NEXT: ldrsh.w r9, [r0]
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: ldrsh.w r2, [r1, #2]
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: smlalbb r4, r11, r6, r5
+; CHECK-NEXT: ldrsh r9, [r0], #4
+; CHECK-NEXT: ldrsh r6, [r1], #4
+; CHECK-NEXT: ldrsh r5, [r0, #-2]
+; CHECK-NEXT: ldrsh r2, [r1, #-2]
 ; CHECK-NEXT: smlalbb r12, r7, r6, r9
+; CHECK-NEXT: smlalbb r4, r11, r6, r5
 ; CHECK-NEXT: muls r5, r2, r5
 ; CHECK-NEXT: smlalbb r4, r11, r2, r9
 ; CHECK-NEXT: subs.w r12, r12, r5
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
index df92d30da6af4..bf43a4956a539 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
@@ -1739,9 +1739,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_slt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
 ; CHECK-LABEL: icmp_slt_v4i32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s32 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1755,9 +1756,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_slt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
 ; CHECK-LABEL: icmp_slt_v8i16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s16 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1771,9 +1773,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_slt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
 ; CHECK-LABEL: icmp_slt_v16i8_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s8 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1787,9 +1790,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_sgt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
 ; CHECK-LABEL: icmp_sgt_v4i32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s32 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1803,9 +1807,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_sgt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
 ; CHECK-LABEL: icmp_sgt_v8i16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s16 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1819,9 +1824,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_sgt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
 ; CHECK-LABEL: icmp_sgt_v16i8_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.s8 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1835,9 +1841,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_ult_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
 ; CHECK-LABEL: icmp_ult_v4i32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u32 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1851,9 +1858,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_ult_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
 ; CHECK-LABEL: icmp_ult_v8i16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u16 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1867,9 +1875,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_ult_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
 ; CHECK-LABEL: icmp_ult_v16i8_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u8 hi, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1883,9 +1892,10 @@ define arm_aapcs_vfpcc <4 x i32> @icmp_ugt_v4i32_y(<4 x i32> %x, <4 x i32> %y, i
 ; CHECK-LABEL: icmp_ugt_v4i32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u32 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1899,9 +1909,10 @@ define arm_aapcs_vfpcc <8 x i16> @icmp_ugt_v8i16_y(<8 x i16> %x, <8 x i16> %y, i
 ; CHECK-LABEL: icmp_ugt_v8i16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u16 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1915,9 +1926,10 @@ define arm_aapcs_vfpcc <16 x i8> @icmp_ugt_v16i8_y(<16 x i8> %x, <16 x i8> %y, i
 ; CHECK-LABEL: icmp_ugt_v16i8_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.8 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.u8 hi, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1931,9 +1943,10 @@ define arm_aapcs_vfpcc <4 x float> @fcmp_fast_olt_v4f32_y(<4 x float> %x, <4 x f
 ; CHECK-LABEL: fcmp_fast_olt_v4f32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.f32 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1947,9 +1960,10 @@ define arm_aapcs_vfpcc <8 x half> @fcmp_fast_olt_v8f16_y(<8 x half> %x, <8 x hal
 ; CHECK-LABEL: fcmp_fast_olt_v8f16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.f16 gt, q1, q0
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1963,9 +1977,10 @@ define arm_aapcs_vfpcc <4 x float> @fcmp_fast_ogt_v4f32_y(<4 x float> %x, <4 x f
 ; CHECK-LABEL: fcmp_fast_ogt_v4f32_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.32 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.f32 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1979,9 +1994,10 @@ define arm_aapcs_vfpcc <8 x half> @fcmp_fast_ogt_v8f16_y(<8 x half> %x, <8 x hal
 ; CHECK-LABEL: fcmp_fast_ogt_v8f16_y:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vctp.16 r0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vcmpt.f16 gt, q0, q1
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
 %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
new file mode 100644
index 0000000000000..311a06a675771
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -verify-machineinstrs -o - | FileCheck %s
+
+define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %pResult, i32* nocapture %pIndex) {
+; CHECK-LABEL: arm_min_helium_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r6, r7, lr}
+; CHECK-NEXT: push {r4, r6, r7, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: vidup.u32 q2, r6, #1
+; CHECK-NEXT: adr r4, .LCPI0_0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vmov.i32 q3, #0x4
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: dlstp.32 lr, r12
+; CHECK-NEXT: .LBB0_1: @ %do.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q4, [r0], #16
+; CHECK-NEXT: vcmp.f32 ge, q1, q4
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vmovt q1, q4
+; CHECK-NEXT: vmovt q0, q2
+; CHECK-NEXT: vadd.i32 q2, q2, q3
+; CHECK-NEXT: letp lr, .LBB0_1
+; CHECK-NEXT: @ %bb.2: @ %do.end
+; CHECK-NEXT: vldr s8, .LCPI0_1
+; CHECK-NEXT: vdup.32 q3, r1
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vminnmv.f32 r0, q1
+; CHECK-NEXT: vcmp.f32 le, q1, r0
+; CHECK-NEXT: vmov s8, r0
+; CHECK-NEXT: vpsel q0, q0, q3
+; CHECK-NEXT: vminv.u32 r1, q0
+; CHECK-NEXT: str r1, [r3]
+; CHECK-NEXT: vstr s8, [r2]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.3:
+; CHECK-NEXT: .LCPI0_0:
+; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11
+; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11
+; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11
+; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11
+; CHECK-NEXT: .LCPI0_1:
+; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 1)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 0
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %curExtremValVec.0 = phi <4 x float> [ , %entry ], [ %8, %do.body ]
+ %indexVec.0 = phi <4 x i32> [ %1, %entry ], [ %11, %do.body ]
+ %2 = phi <4 x float> [ zeroinitializer, %entry ], [ %10, %do.body ]
+ %blkCnt.0 = phi i32 [ %blockSize, %entry ], [ %sub, %do.body ]
+ %pSrc.addr.0 = phi float* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
+ %3 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0)
+ %4 = bitcast float* %pSrc.addr.0 to <4 x float>*
+ %5 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> zeroinitializer)
+ %6 = fcmp fast ole <4 x float> %5, %curExtremValVec.0
+ %7 = and <4 x i1> %6, %3
+ %8 = select fast <4 x i1> %7, <4 x float> %5, <4 x float> %curExtremValVec.0
+ %9 = bitcast <4 x i32> %indexVec.0 to <4 x float>
+ %10 = select fast <4 x i1> %7, <4 x float> %9, <4 x float> %2
+ %11 = add <4 x i32> %indexVec.0,
+ %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4
+ %sub = add nsw i32 %blkCnt.0, -4
+ %cmp = icmp sgt i32 %blkCnt.0, 4
+ br i1 %cmp, label %do.body, label %do.end
+
+do.end: ; preds = %do.body
+ %12 = bitcast <4 x float> %10 to <4 x i32>
+ %13 = tail call fast float @llvm.arm.mve.minnmv.f32.v4f32(float 0x426D1A94A0000000, <4 x float> %8)
+ %.splatinsert = insertelement <4 x float> undef, float %13, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
+ %14 = fcmp fast ole <4 x float> %8, %.splat
+ %.splatinsert1 = insertelement <4 x i32> undef, i32 %blockSize, i32 0
+ %.splat2 = shufflevector <4 x i32> %.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %15 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> %.splat2
+ %16 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %blockSize, <4 x i32> %15, i32 1)
+ store i32 %16, i32* %pIndex, align 4
+ store float %13, float* %pResult, align 4
+ ret void
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2
+declare float @llvm.arm.mve.minnmv.f32.v4f32(float, <4 x float>) #1
+declare i32 @llvm.arm.mve.minv.v4i32(i32, <4 x i32>, i32) #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
index 2d9e2047e08cb..ac615a1d57a1c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -177,5 +177,75 @@ entry:
 ret void
 }
 
+define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: scaled_v8i16_i16_2gep:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vldrh.s32 q3, [r1]
+; CHECK-NEXT: vmov.i32 q2, #0x28
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vshl.i32 q3, q3, #1
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q2, q3, q2
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u16 r1, q0[1]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r1, q0[2]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u16 r1, q0[4]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u16 r1, q0[6]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs
+ %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> )
+ ret void
+}
+
+define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
+; CHECK-LABEL: scaled_v8i16_i16_2gep2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adr r1, .LCPI9_0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .LCPI9_0:
+; CHECK-NEXT: .short 20 @ 0x14
+; CHECK-NEXT: .short 23 @ 0x17
+; CHECK-NEXT: .short 26 @ 0x1a
+; CHECK-NEXT: .short 29 @ 0x1d
+; CHECK-NEXT: .short 32 @ 0x20
+; CHECK-NEXT: .short 35 @ 0x23
+; CHECK-NEXT: .short 38 @ 0x26
+; CHECK-NEXT: .short 41 @ 0x29
+entry:
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16>
+ %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
+ call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> )
+ ret void
+}
+
 declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
 declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
index 25a9cea5f5be9..1c9871e54b10d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@@ -233,6 +233,58 @@ entry:
 ret void
 }
 
+define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_scaled_i16_i32_2gep:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.i32 q1, #0xa
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vshl.i32 q2, q2, #1
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q1, q2, q1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov r1, s1
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: strh r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
+ %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> )
+ ret void
+}
+
+define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
+; CHECK-LABEL: ext_scaled_i16_i32_2gep2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adr r1, .LCPI16_0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1]
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .LCPI16_0:
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .long 14 @ 0xe
+entry:
+ %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16>
+ %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
+ %t = trunc <4 x i32> %input to <4 x i16>
+ call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> )
+ ret void
+}
+
 declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
 declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
 declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index d84066cdc5bdd..7fde69ef836c8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -534,6 +534,115 @@ entry:
 ret void
 }
 
+define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_i8_2gep:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrb.s32 q1, [r1, #12]
+; CHECK-NEXT: vldrb.s32 q2, [r1, #8]
+; CHECK-NEXT: vldrb.s32 q3, [r1, #4]
+; CHECK-NEXT: vldrb.s32 q5, [r1]
+; CHECK-NEXT: vmov.i32 q4, #0x5
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vadd.i32 q3, q3, r0
+; CHECK-NEXT: vadd.i32 q5, q5, r0
+; CHECK-NEXT: vadd.i32 q1, q1, q4
+; CHECK-NEXT: vadd.i32 q2, q2, q4
+; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s17
+; CHECK-NEXT: vmov.u8 r1, q0[1]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.u8 r1, q0[4]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s13
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.u8 r1, q0[8]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vmov.u8 r1, q0[9]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s5
+; CHECK-NEXT: vmov.u8 r1, q0[13]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vmov r0, s7
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: strb r1, [r0]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
+entry:
+ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
+ %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> )
+ ret void
+}
+
+define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
+; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adr r1, .LCPI11_0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrb.8 q0, [r0, q1]
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.1:
+; CHECK-NEXT: .LCPI11_0:
+; CHECK-NEXT: .byte 5 @ 0x5
+; CHECK-NEXT: .byte 8 @ 0x8
+; CHECK-NEXT: .byte 11 @ 0xb
+; CHECK-NEXT: .byte 14 @ 0xe
+; CHECK-NEXT: .byte 17 @ 0x11
+; CHECK-NEXT: .byte 20 @ 0x14
+; CHECK-NEXT: .byte 23 @ 0x17
+; CHECK-NEXT: .byte 26 @ 0x1a
+; CHECK-NEXT: .byte 29 @ 0x1d
+; CHECK-NEXT: .byte 32 @ 0x20
+; CHECK-NEXT: .byte 35 @ 0x23
+; CHECK-NEXT: .byte 38 @ 0x26
+; CHECK-NEXT: .byte 41 @ 0x29
+; CHECK-NEXT: .byte 44 @ 0x2c
+; CHECK-NEXT: .byte 47 @ 0x2f
+; CHECK-NEXT: .byte 50 @ 0x32
+entry:
+ %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8>
+ %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
+ call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> )
+ ret void
+}
+
 declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
 declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
new file mode 100644
index 0000000000000..54249151d448e
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x float> @foo_v4i16(<4 x i16>* nocapture readonly %pSrc, i32 %blockSize, <4 x i16> %a) {
+; CHECK-LABEL: foo_v4i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vpt.s32 lt, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vcvt.f32.s32 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %active.lane.mask = icmp slt <4 x i16> %a, zeroinitializer
+ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %pSrc, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
+ %0 = sitofp <4 x i16> %wide.masked.load to <4 x float>
+ ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @foo_v8i8(<8 x i8>* nocapture readonly %pSrc, i32 %blockSize, <8 x i8> %a) {
+; CHECK-LABEL: foo_v8i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.s8 q0, q0
+; CHECK-NEXT: vpt.s16 lt, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0]
+; CHECK-NEXT: vcvt.f16.s16 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %active.lane.mask = icmp slt <8 x i8> %a, zeroinitializer
+ %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %pSrc, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
+ %0 = sitofp <8 x i8> %wide.masked.load to <8 x half>
+ ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @foo_v4i8(<4 x i8>* nocapture readonly %pSrc, i32 %blockSize, <4 x i8> %a) {
+; CHECK-LABEL: foo_v4i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.s8 q0, q0
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vpt.s32 lt, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-NEXT: vcvt.f32.s32 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %active.lane.mask = icmp slt <4 x i8> %a, zeroinitializer
+ %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %pSrc, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
+ %0 = sitofp <4 x i8> %wide.masked.load to <4 x float>
+ ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) {
+; CHECK-LABEL: foo_v4i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpt.s32 lt, q0, zr
+; CHECK-NEXT: vldrwt.u32 q5, [r0]
+; CHECK-NEXT: vmov.f64 d8, d10
+; CHECK-NEXT: vmov.f32 s18, s21
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: asrs r1, r0, #31
+; CHECK-NEXT: bl __aeabi_l2d
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov d9, r0, r1
+; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: bl __aeabi_l2d
+; CHECK-NEXT: vmov.f64 d12, d11
+; CHECK-NEXT: vmov.f32 s26, s23
+; CHECK-NEXT: vmov d8, r0, r1
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: bl __aeabi_l2d
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vmov d11, r0, r1
+; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: bl __aeabi_l2d
+; CHECK-NEXT: vmov d10, r0, r1
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+ %0 = sitofp <4 x i32> %wide.masked.load to <4 x double>
+ ret <4 x double> %0
+}
+
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
+
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
+
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>)
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
new file mode 100644
index 0000000000000..3bbf2fc23a1cf
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, <4 x float>* %z) {
+; CHECK-MVE-LABEL: vabd_v4f32
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-MVE-NEXT: .pad #4
+; CHECK-MVE-NEXT: sub sp, #4
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-MVE-NEXT: vmov q4, q1
+; CHECK-MVE-NEXT: vmov q5, q0
+; CHECK-MVE-NEXT: mov r4, r0
+; CHECK-MVE-NEXT: vmov r0, s20
+; CHECK-MVE-NEXT: vmov r1, s16
+; CHECK-MVE-NEXT: bl __aeabi_fsub
+; CHECK-MVE-NEXT: mov r5, r0
+; CHECK-MVE-NEXT: vmov r0, s21
+
+; CHECK-MVEFP-LABEL: vabd_v4f32
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vabd.f32 q0, q0, q1
+; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %0 = fsub <4 x float> %x, %y
+ %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* %z, align 4
+ ret void
+}
+
+define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, <8 x half>* %z) {
+; CHECK-MVE-LABEL: vabd_v8f16
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .save {r4, r5, r6, lr}
+; CHECK-MVE-NEXT: push {r4, r5, r6, lr}
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-MVE-NEXT: mov r4, r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q1[1]
+; CHECK-MVE-NEXT: vmov q5, q1
+; CHECK-MVE-NEXT: vmov q4, q0
+; CHECK-MVE-NEXT: bl __aeabi_h2f
+; CHECK-MVE-NEXT: mov r5, r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q4[1]
+
+; CHECK-MVEFP-LABEL: vabd_v8f16
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vabd.f16 q0, q0, q1
+; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %0 = fsub <8 x half> %x, %y
+ %1 = call <8 x half> @llvm.fabs.v8f16(<8 x half> %0)
+ store <8 x half> %1, <8 x half>* %z
+ ret void
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
index 67bc161e02c64..8cddbc79e2e13 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
@@ -4,10 +4,11 @@
 define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 ; CHECK-LABEL: vctp8:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.8 r0
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
 %pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg)
@@ -20,10 +21,11 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 ; CHECK-LABEL: vctp16:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.16 r0
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
 %pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg)
@@ -36,10 +38,11 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
 ; CHECK-LABEL: vctp32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vctp.32 r0
 ; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
 %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %arg)
diff --git a/llvm/test/CodeGen/Thumb2/postinc-distribute.mir b/llvm/test/CodeGen/Thumb2/postinc-distribute.mir
index af39cced110a0..d08ac4754c703 100644
--- a/llvm/test/CodeGen/Thumb2/postinc-distribute.mir
+++ b/llvm/test/CodeGen/Thumb2/postinc-distribute.mir
@@ -12,6 +12,8 @@
 define i32* @t2STRBi12(i32* %x, i32 %y) { unreachable }
 define i32* @storedadd(i32* %x, i32 %y) { unreachable }
+ define i32* @minsize2(i32* %x, i32 %y) minsize optsize { unreachable }
+ define i32* @minsize3(i32* %x, i32 %y) minsize optsize { unreachable }
 ...
 ---
@@ -57,9 +59,8 @@ body: |
 ; CHECK-LABEL: name: t2LDRHi12
 ; CHECK: liveins: $r0
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
- ; CHECK: [[t2LDRHi12_:%[0-9]+]]:rgpr = t2LDRHi12 [[COPY]], 0, 14 /* CC::al */, $noreg :: (load 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: $r0 = COPY [[t2ADDri]]
+ ; CHECK: [[t2LDRH_POST:%[0-9]+]]:rgpr, [[t2LDRH_POST1:%[0-9]+]]:rgpr = t2LDRH_POST [[COPY]], 32, 14 /* CC::al */, $noreg :: (load 4)
+ ; CHECK: $r0 = COPY [[t2LDRH_POST1]]
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
 %0:gprnopc = COPY $r0
 %1:rgpr = t2LDRHi12 %0, 0, 14, $noreg :: (load 4, align 4)
@@ -84,9 +85,8 @@ body: |
 ; CHECK-LABEL: name: t2LDRSHi12
 ; CHECK: liveins: $r0
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
- ; CHECK: [[t2LDRSHi12_:%[0-9]+]]:rgpr = t2LDRSHi12 [[COPY]], 0, 14 /* CC::al */, $noreg :: (load 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: $r0 = COPY [[t2ADDri]]
+ ; CHECK: [[t2LDRSH_POST:%[0-9]+]]:rgpr, [[t2LDRSH_POST1:%[0-9]+]]:rgpr = t2LDRSH_POST [[COPY]], 32, 14 /* CC::al */, $noreg :: (load 4)
+ ; CHECK: $r0 = COPY [[t2LDRSH_POST1]]
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
 %0:gprnopc = COPY $r0
 %1:rgpr = t2LDRSHi12 %0, 0, 14, $noreg :: (load 4, align 4)
@@ -111,9 +111,8 @@ body: |
 ; CHECK-LABEL: name: t2LDRBi12
 ; CHECK: liveins: $r0
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
- ; CHECK: [[t2LDRBi12_:%[0-9]+]]:rgpr = t2LDRBi12 [[COPY]], 0, 14 /* CC::al */, $noreg :: (load 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: $r0 = COPY [[t2ADDri]]
+ ; CHECK: [[t2LDRB_POST:%[0-9]+]]:rgpr, [[t2LDRB_POST1:%[0-9]+]]:rgpr = t2LDRB_POST [[COPY]], 32, 14 /* CC::al */, $noreg :: (load 4)
+ ; CHECK: $r0 = COPY [[t2LDRB_POST1]]
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
 %0:gprnopc = COPY $r0
 %1:rgpr = t2LDRBi12 %0, 0, 14, $noreg :: (load 4, align 4)
@@ -138,9 +137,8 @@ body: |
 ; CHECK-LABEL: name: t2LDRSBi12
 ; CHECK: liveins: $r0
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
- ; CHECK: [[t2LDRSBi12_:%[0-9]+]]:rgpr = t2LDRSBi12 [[COPY]], 0, 14 /* CC::al */, $noreg :: (load 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: $r0 = COPY [[t2ADDri]]
+ ; CHECK: [[t2LDRSB_POST:%[0-9]+]]:rgpr, [[t2LDRSB_POST1:%[0-9]+]]:rgpr = t2LDRSB_POST [[COPY]], 32, 14 /* CC::al */, $noreg :: (load 4)
+ ; CHECK: $r0 = COPY [[t2LDRSB_POST1]]
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
 %0:gprnopc = COPY $r0
 %1:rgpr = t2LDRSBi12 %0, 0, 14, $noreg :: (load 4, align 4)
@@ -197,9 +195,8 @@ body: |
 ; CHECK: liveins: $r0, $r1
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
 ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
- ; CHECK: t2STRHi12 [[COPY1]], [[COPY]], 0, 14 /* CC::al */, $noreg :: (store 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: $r0 = COPY [[t2ADDri]]
+ ; CHECK: early-clobber %2:rgpr = t2STRH_POST [[COPY1]], [[COPY]], 32, 14 /* CC::al */, $noreg :: (store 4)
+ ; CHECK: $r0 = COPY %2
 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
 %0:gprnopc = COPY $r0
 %1:rgpr = COPY $r1
@@ -227,9 +224,8 @@ body: |
 ; CHECK: liveins: $r0, $r1
 ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0
 ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
- ; CHECK: t2STRBi12 [[COPY1]], [[COPY]], 0, 14 /* CC::al */, $noreg :: (store 4)
- ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg
14 /* CC::al */, $noreg, $noreg - ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: early-clobber %2:rgpr = t2STRB_POST [[COPY1]], [[COPY]], 32, 14 /* CC::al */, $noreg :: (store 4) + ; CHECK: $r0 = COPY %2 ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 %0:gprnopc = COPY $r0 %1:rgpr = COPY $r1 @@ -265,3 +261,65 @@ body: | tBX_RET 14, $noreg, implicit $r0 ... +--- +name: minsize2 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc, preferred-register: '' } + - { id: 1, class: rgpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } + - { id: 3, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } +body: | + bb.0: + liveins: $r0 + + ; CHECK-LABEL: name: minsize2 + ; CHECK: liveins: $r0 + ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK: [[t2LDRB_POST:%[0-9]+]]:rgpr, [[t2LDRB_POST1:%[0-9]+]]:rgpr = t2LDRB_POST [[COPY]], 32, 14 /* CC::al */, $noreg :: (load 4) + ; CHECK: [[t2LDRBi8_:%[0-9]+]]:rgpr = t2LDRBi8 [[t2LDRB_POST1]], -30, 14 /* CC::al */, $noreg :: (load 4) + ; CHECK: $r0 = COPY [[t2LDRB_POST1]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:gprnopc = COPY $r0 + %1:rgpr = t2LDRBi12 %0, 0, 14, $noreg :: (load 4, align 4) + %3:rgpr = t2LDRBi12 %0, 2, 14, $noreg :: (load 4, align 4) + %2:rgpr = nuw t2ADDri %0, 32, 14, $noreg, $noreg + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... +--- +name: minsize3 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnopc, preferred-register: '' } + - { id: 1, class: rgpr, preferred-register: '' } + - { id: 2, class: rgpr, preferred-register: '' } + - { id: 3, class: rgpr, preferred-register: '' } + - { id: 4, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } +body: | + bb.0: + liveins: $r0 + + ; CHECK-LABEL: name: minsize3 + ; CHECK: liveins: $r0 + ; CHECK: [[COPY:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK: [[t2LDRBi12_:%[0-9]+]]:rgpr = t2LDRBi12 [[COPY]], 0, 14 /* CC::al */, $noreg :: (load 4) + ; CHECK: [[t2LDRBi12_1:%[0-9]+]]:rgpr = t2LDRBi12 [[COPY]], 2, 14 /* CC::al */, $noreg :: (load 4) + ; CHECK: [[t2LDRBi12_2:%[0-9]+]]:rgpr = t2LDRBi12 [[COPY]], 4, 14 /* CC::al */, $noreg :: (load 4) + ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = nuw t2ADDri [[COPY]], 32, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r0 = COPY [[t2ADDri]] + ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + %0:gprnopc = COPY $r0 + %1:rgpr = t2LDRBi12 %0, 0, 14, $noreg :: (load 4, align 4) + %3:rgpr = t2LDRBi12 %0, 2, 14, $noreg :: (load 4, align 4) + %4:rgpr = t2LDRBi12 %0, 4, 14, $noreg :: (load 4, align 4) + %2:rgpr = nuw t2ADDri %0, 32, 14, $noreg, $noreg + $r0 = COPY %2 + tBX_RET 14, $noreg, implicit $r0 + +... 
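For readers tracing the new `minsize2` and `minsize3` cases above: once the +32 base update is folded into a post-indexed `t2LDRB_POST`, any remaining load off the old base has to be rebased against the written-back pointer, which is why the offset-2 load in `minsize2` becomes `t2LDRBi8 ..., -30` (2 - 32 = -30), while in `minsize3`, with three such loads to rewrite, the fold is no longer a size win and the MIR is left unchanged. The same arithmetic explains why the earlier store hunks replace `t2STRHi12 ..., 0` plus `t2ADDri ..., 32` with a single `t2STRH_POST ..., 32`. A minimal C sketch of that pointer algebra (illustrative only; `rebase` is a made-up helper, not the pass's implementation):

    #include <assert.h>
    #include <stdint.h>

    /* Model of rebasing a fixed-offset access after the +32 base
     * update has been folded into a post-indexed load: an access at
     * old_base + off becomes new_base + (off - 32), where
     * new_base = old_base + 32 is the writeback result. */
    static const uint8_t *rebase(const uint8_t *new_base, int old_offset) {
        return new_base + (old_offset - 32);
    }

    int main(void) {
        uint8_t buf[64] = {0};
        const uint8_t *old_base = buf;
        const uint8_t *new_base = old_base + 32; /* writeback result */
        /* The offset-2 load from minsize2 turns into offset -30. */
        assert(rebase(new_base, 2) == old_base + 2);
        return 0;
    }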
diff --git a/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir b/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir index d0bcc666dfacb..1729a59819217 100644 --- a/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/t2-teq-reduce.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s +# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir index 166914d5dedde..48b75ed5e3465 100644 --- a/llvm/test/CodeGen/Thumb2/t2sizereduction.mir +++ b/llvm/test/CodeGen/Thumb2/t2sizereduction.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s +# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/VE/addition.ll b/llvm/test/CodeGen/VE/addition.ll index 730776ec534b8..54275e9e0e267 100644 --- a/llvm/test/CodeGen/VE/addition.ll +++ b/llvm/test/CodeGen/VE/addition.ll @@ -3,9 +3,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add i8 %1, %0 ret i8 %3 @@ -14,9 +16,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { define signext i16 @func2(i16 signext %0, i16 signext %1) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add i16 %1, %0 ret i16 %3 @@ -25,6 +29,8 @@ define signext i16 @func2(i16 signext %0, i16 signext %1) { define i32 @func3(i32 %0, i32 %1) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add nsw i32 %1, %0 @@ -43,6 +49,8 @@ define i64 @func4(i64 %0, i64 %1) { define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -53,6 +61,8 @@ define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { define zeroext i16 @func7(i16 zeroext %0, i16 zeroext %1) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -63,6 +73,8 @@ define zeroext i16 @func7(i16 zeroext %0, i16 zeroext %1) { define i32 @func8(i32 %0, i32 %1) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: adds.w.sx %s0, %s1, %s0 ; 
CHECK-NEXT: or %s11, 0, %s9 %3 = add i32 %1, %0 @@ -81,9 +93,10 @@ define i64 @func9(i64 %0, i64 %1) { define signext i8 @func13(i8 signext %0) { ; CHECK-LABEL: func13: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = add i8 %0, 5 ret i8 %2 @@ -92,9 +105,10 @@ define signext i8 @func13(i8 signext %0) { define signext i16 @func14(i16 signext %0) { ; CHECK-LABEL: func14: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %2 = add i16 %0, 5 ret i16 %2 @@ -103,6 +117,7 @@ define signext i16 @func14(i16 signext %0) { define i32 @func15(i32 %0) { ; CHECK-LABEL: func15: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = add nsw i32 %0, 5 @@ -121,6 +136,7 @@ define i64 @func16(i64 %0) { define zeroext i8 @func18(i8 zeroext %0) { ; CHECK-LABEL: func18: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -131,6 +147,7 @@ define zeroext i8 @func18(i8 zeroext %0) { define zeroext i16 @func19(i16 zeroext %0) { ; CHECK-LABEL: func19: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -141,6 +158,7 @@ define zeroext i16 @func19(i16 zeroext %0) { define i32 @func20(i32 %0) { ; CHECK-LABEL: func20: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, 5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = add i32 %0, 5 diff --git a/llvm/test/CodeGen/VE/bitcast.ll b/llvm/test/CodeGen/VE/bitcast.ll index dacc8f189e966..d7c09cd46b613 100644 --- a/llvm/test/CodeGen/VE/bitcast.ll +++ b/llvm/test/CodeGen/VE/bitcast.ll @@ -22,9 +22,8 @@ define dso_local double @bitcastl2d(i64 %x) { define dso_local float @bitcastw2f(i32 %x) { ; CHECK-LABEL: bitcastw2f: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sll %s0, %s0, 32 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = bitcast i32 %x to float ret float %r @@ -34,9 +33,7 @@ define dso_local float @bitcastw2f(i32 %x) { define dso_local i32 @bitcastf2w(float %x) { ; CHECK-LABEL: bitcastf2w: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf0 killed $sf0 def $sx0 ; CHECK-NEXT: sra.l %s0, %s0, 32 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = bitcast float %x to i32 ret i32 %r diff --git a/llvm/test/CodeGen/VE/bitreverse.ll b/llvm/test/CodeGen/VE/bitreverse.ll index fce969af657e2..af58afe38fd97 100644 --- a/llvm/test/CodeGen/VE/bitreverse.ll +++ b/llvm/test/CodeGen/VE/bitreverse.ll @@ -14,10 +14,8 @@ declare i64 @llvm.bitreverse.i64(i64) define i32 @func2(i32 %p) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: srl %s0, %s0, 32 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 
@llvm.bitreverse.i32(i32 %p) ret i32 %r @@ -28,10 +26,8 @@ declare i32 @llvm.bitreverse.i32(i32) define signext i16 @func3(i16 signext %p) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: sra.l %s0, %s0, 48 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.bitreverse.i16(i16 %p) ret i16 %r @@ -42,10 +38,8 @@ declare i16 @llvm.bitreverse.i16(i16) define signext i8 @func4(i8 signext %p) { ; CHECK-LABEL: func4: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: sra.l %s0, %s0, 56 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i8 @llvm.bitreverse.i8(i8 %p) ret i8 %r @@ -65,10 +59,8 @@ define i64 @func5(i64 %p) { define i32 @func6(i32 %p) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: srl %s0, %s0, 32 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.bitreverse.i32(i32 %p) ret i32 %r @@ -77,10 +69,8 @@ define i32 @func6(i32 %p) { define zeroext i16 @func7(i16 zeroext %p) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: srl %s0, %s0, 48 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.bitreverse.i16(i16 %p) ret i16 %r @@ -89,10 +79,8 @@ define zeroext i16 @func7(i16 zeroext %p) { define zeroext i8 @func8(i8 zeroext %p) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 ; CHECK-NEXT: brv %s0, %s0 ; CHECK-NEXT: srl %s0, %s0, 56 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i8 @llvm.bitreverse.i8(i8 %p) ret i8 %r diff --git a/llvm/test/CodeGen/VE/branch1.ll b/llvm/test/CodeGen/VE/branch1.ll index c9f0a22c4c0a0..5561284c992e6 100644 --- a/llvm/test/CodeGen/VE/branch1.ll +++ b/llvm/test/CodeGen/VE/branch1.ll @@ -3,19 +3,22 @@ define signext i8 @func1(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brle.w %s0, %s1, .LBB0_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: brle.w %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB0_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp sgt i8 %a, %b @@ -36,17 +39,20 @@ declare i32 @ret(i32) define i32 @func2(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brle.w %s0, %s1, .LBB1_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: brle.w %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; 
CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB1_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp sgt i16 %a, %b @@ -64,17 +70,20 @@ join: define i32 @func3(i32 %a, i32 %b) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brle.w %s0, %s1, .LBB2_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: brle.w %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB2_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp sgt i32 %a, %b @@ -92,17 +101,18 @@ join: define i32 @func4(i64 %a, i64 %b) { ; CHECK-LABEL: func4: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brle.l %s0, %s1, .LBB3_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brle.l %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB3_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp sgt i64 %a, %b @@ -120,18 +130,21 @@ join: define i32 @func5(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: func5: ; CHECK: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: cmpu.w %s0, %s1, %s0 -; CHECK-NEXT: brle.w 0, %s0, .LBB4_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brle.w 0, %s0, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB4_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp ugt i8 %a, %b @@ -149,18 +162,21 @@ join: define i32 @func6(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: cmpu.w %s0, %s1, %s0 -; CHECK-NEXT: brle.w 0, %s0, .LBB5_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brle.w 0, %s0, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB5_3 -; 
CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp ugt i16 %a, %b @@ -178,18 +194,21 @@ join: define i32 @func7(i32 %a, i32 %b) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_5: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: cmpu.w %s0, %s1, %s0 -; CHECK-NEXT: brle.w 0, %s0, .LBB6_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brle.w 0, %s0, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB6_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = icmp ugt i32 %a, %b @@ -207,17 +226,18 @@ join: define i32 @func8(float %a, float %b) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brlenan.s %s0, %s1, .LBB7_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brlenan.s %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB7_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = fcmp ogt float %a, %b @@ -235,17 +255,18 @@ join: define i32 @func9(double %a, double %b) { ; CHECK-LABEL: func9: ; CHECK: .LBB{{[0-9]+}}_5: -; CHECK-NEXT: brlenan.d %s0, %s1, .LBB8_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brlenan.d %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB8_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = fcmp ogt double %a, %b @@ -264,17 +285,18 @@ define i32 @func10(double %a, double %b) { ; CHECK-LABEL: func10: ; CHECK: .LBB{{[0-9]+}}_5: ; CHECK-NEXT: lea.sl %s1, 1075052544 -; CHECK-NEXT: brlenan.d %s0, %s1, .LBB9_1 -; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: brlenan.d %s0, %s1, .LBB{{[0-9]+}}_1 +; CHECK-NEXT: # %bb.2: # %on.true ; CHECK-NEXT: lea %s0, ret@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, ret@hi(, %s0) ; CHECK-NEXT: or %s0, 2, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: br.l.t .LBB9_3 -; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: br.l.t .LBB{{[0-9]+}}_3 +; CHECK-NEXT: .LBB{{[0-9]+}}_1: ; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK: .LBB{{[0-9]+}}_3: +; CHECK-NEXT: .LBB{{[0-9]+}}_3: # %join ; CHECK-NEXT: or %s11, 0, %s9 entry: %cmp = fcmp ogt double %a, 5.000000e+00 diff 
--git a/llvm/test/CodeGen/VE/bswap.ll b/llvm/test/CodeGen/VE/bswap.ll index 274085462856f..39569d8889c5a 100644 --- a/llvm/test/CodeGen/VE/bswap.ll +++ b/llvm/test/CodeGen/VE/bswap.ll @@ -14,9 +14,8 @@ declare i64 @llvm.bswap.i64(i64) define i32 @func2(i32 %p) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: bswp %s0, %s0, 1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.bswap.i32(i32 %p) ret i32 %r @@ -27,9 +26,12 @@ declare i32 @llvm.bswap.i32(i32) define signext i16 @func3(i16 signext %p) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: bswp %s0, %s0, 1 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.bswap.i16(i16 %p) ret i16 %r @@ -49,9 +51,8 @@ define i64 @func4(i64 %p) { define i32 @func5(i32 %p) { ; CHECK-LABEL: func5: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: bswp %s0, %s0, 1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.bswap.i32(i32 %p) ret i32 %r @@ -60,11 +61,11 @@ define i32 @func5(i32 %p) { define zeroext i16 @func6(i16 zeroext %p) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: bswp %s0, %s0, 1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, 16 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.bswap.i16(i16 %p) ret i16 %r diff --git a/llvm/test/CodeGen/VE/call.ll b/llvm/test/CodeGen/VE/call.ll index 9e9f22b6d8233..386a5fd74bf97 100644 --- a/llvm/test/CodeGen/VE/call.ll +++ b/llvm/test/CodeGen/VE/call.ll @@ -20,7 +20,7 @@ define i32 @stack_call_int() { ; CHECK-LABEL: stack_call_int: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s0, 10, (0)1 -; CHECK-NEXT: stl %s0, 248(, %s11) +; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: or %s34, 9, (0)1 ; CHECK-NEXT: lea %s0, stack_callee_int@lo ; CHECK-NEXT: and %s0, %s0, (32)0 @@ -33,7 +33,7 @@ define i32 @stack_call_int() { ; CHECK-NEXT: or %s5, 6, (0)1 ; CHECK-NEXT: or %s6, 7, (0)1 ; CHECK-NEXT: or %s7, 8, (0)1 -; CHECK-NEXT: stl %s34, 240(, %s11) +; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @stack_callee_int(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10) @@ -46,7 +46,7 @@ define i32 @stack_call_int_szext() { ; CHECK-LABEL: stack_call_int_szext: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s0, -1, (0)1 -; CHECK-NEXT: stl %s0, 248(, %s11) +; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: lea %s34, 65535 ; CHECK-NEXT: lea %s1, stack_callee_int_szext@lo ; CHECK-NEXT: and %s1, %s1, (32)0 @@ -58,7 +58,7 @@ define i32 @stack_call_int_szext() { ; CHECK-NEXT: or %s5, 6, (0)1 ; CHECK-NEXT: or %s6, 7, (0)1 ; CHECK-NEXT: or %s7, 8, (0)1 -; CHECK-NEXT: stl %s34, 240(, %s11) +; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @stack_callee_int_szext(i1 -1, i8 -1, 
i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i16 -1, i8 -1) @@ -70,12 +70,9 @@ declare i32 @stack_callee_int_szext(i1 signext, i8 zeroext, i32, i32, i32, i32, define float @stack_call_float() { ; CHECK-LABEL: stack_call_float: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: lea %s0, 1092616192 -; CHECK-NEXT: stl %s0, 252(, %s11) -; CHECK-NEXT: lea %s34, 1091567616 -; CHECK-NEXT: lea %s0, stack_callee_float@lo -; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, stack_callee_float@hi(, %s0) +; CHECK-NEXT: lea.sl %s0, 1092616192 +; CHECK-NEXT: st %s0, 248(, %s11) +; CHECK-NEXT: lea.sl %s34, 1091567616 ; CHECK-NEXT: lea.sl %s0, 1065353216 ; CHECK-NEXT: lea.sl %s1, 1073741824 ; CHECK-NEXT: lea.sl %s2, 1077936128 @@ -84,15 +81,10 @@ define float @stack_call_float() { ; CHECK-NEXT: lea.sl %s5, 1086324736 ; CHECK-NEXT: lea.sl %s6, 1088421888 ; CHECK-NEXT: lea.sl %s7, 1090519040 -; CHECK-NEXT: stl %s34, 244(, %s11) -; CHECK-NEXT: # kill: def $sf0 killed $sf0 killed $sx0 -; CHECK-NEXT: # kill: def $sf1 killed $sf1 killed $sx1 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 killed $sx2 -; CHECK-NEXT: # kill: def $sf3 killed $sf3 killed $sx3 -; CHECK-NEXT: # kill: def $sf4 killed $sf4 killed $sx4 -; CHECK-NEXT: # kill: def $sf5 killed $sf5 killed $sx5 -; CHECK-NEXT: # kill: def $sf6 killed $sf6 killed $sx6 -; CHECK-NEXT: # kill: def $sf7 killed $sf7 killed $sx7 +; CHECK-NEXT: lea %s35, stack_callee_float@lo +; CHECK-NEXT: and %s35, %s35, (32)0 +; CHECK-NEXT: lea.sl %s12, stack_callee_float@hi(, %s35) +; CHECK-NEXT: st %s34, 240(, %s11) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call float @stack_callee_float(float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0) @@ -104,11 +96,11 @@ declare float @stack_callee_float(float, float, float, float, float, float, floa define float @stack_call_float2(float %p0) { ; CHECK-LABEL: stack_call_float2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: stu %s0, 252(, %s11) +; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: lea %s1, stack_callee_float@lo ; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: lea.sl %s12, stack_callee_float@hi(, %s1) -; CHECK-NEXT: stu %s0, 244(, %s11) +; CHECK-NEXT: st %s0, 240(, %s11) ; CHECK-NEXT: or %s1, 0, %s0 ; CHECK-NEXT: or %s2, 0, %s0 ; CHECK-NEXT: or %s3, 0, %s0 diff --git a/llvm/test/CodeGen/VE/cast.ll b/llvm/test/CodeGen/VE/cast.ll index 51126e123ac67..07ad969a1bd3e 100644 --- a/llvm/test/CodeGen/VE/cast.ll +++ b/llvm/test/CodeGen/VE/cast.ll @@ -4,6 +4,7 @@ define i32 @i() { ; CHECK-LABEL: i: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea %s0, -2147483648 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s11, 0, %s9 ret i32 -2147483648 } @@ -12,6 +13,7 @@ define i32 @ui() { ; CHECK-LABEL: ui: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea %s0, -2147483648 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s11, 0, %s9 ret i32 -2147483648 } @@ -37,6 +39,7 @@ define signext i8 @d2c(double %x) { ; CHECK-LABEL: d2c: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptosi double %x to i8 ret i8 %r @@ -46,6 +49,7 @@ define zeroext i8 @d2uc(double %x) { ; CHECK-LABEL: d2uc: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui double %x to i8 ret i8 %r @@ -55,6 +59,7 @@ define signext i16 @d2s(double %x) { ; CHECK-LABEL: d2s: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz 
%s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptosi double %x to i16 ret i16 %r @@ -64,6 +69,7 @@ define zeroext i16 @d2us(double %x) { ; CHECK-LABEL: d2us: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui double %x to i16 ret i16 %r @@ -82,7 +88,6 @@ define i32 @d2ui(double %x) { ; CHECK-LABEL: d2ui: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.l.d.rz %s0, %s0 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui double %x to i32 ret i32 %r @@ -133,6 +138,7 @@ define signext i8 @f2c(float %x) { ; CHECK-LABEL: f2c: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptosi float %x to i8 ret i8 %r @@ -142,6 +148,7 @@ define zeroext i8 @f2uc(float %x) { ; CHECK-LABEL: f2uc: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui float %x to i8 ret i8 %r @@ -151,6 +158,7 @@ define signext i16 @f2s(float %x) { ; CHECK-LABEL: f2s: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptosi float %x to i16 ret i16 %r @@ -160,6 +168,7 @@ define zeroext i16 @f2us(float %x) { ; CHECK-LABEL: f2us: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui float %x to i16 ret i16 %r @@ -179,7 +188,6 @@ define i32 @f2ui(float %x) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: cvt.l.d.rz %s0, %s0 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = fptoui float %x to i32 ret i32 %r @@ -234,7 +242,6 @@ define signext i8 @ll2c(i64 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 56 ; CHECK-NEXT: sra.l %s0, %s0, 56 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i8 ret i8 %2 @@ -254,7 +261,6 @@ define signext i16 @ll2s(i64 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 48 ; CHECK-NEXT: sra.l %s0, %s0, 48 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i16 ret i16 %2 @@ -272,7 +278,6 @@ define zeroext i16 @ll2us(i64 %0) { define i32 @ll2i(i64 %0) { ; CHECK-LABEL: ll2i: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i32 ret i32 %2 @@ -281,7 +286,6 @@ define i32 @ll2i(i64 %0) { define i32 @ll2ui(i64 %0) { ; CHECK-LABEL: ll2ui: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i32 ret i32 %2 @@ -325,7 +329,6 @@ define signext i8 @ull2c(i64 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 56 ; CHECK-NEXT: sra.l %s0, %s0, 56 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i8 ret i8 %2 @@ -345,7 +348,6 @@ define signext i16 @ull2s(i64 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 48 ; CHECK-NEXT: sra.l %s0, %s0, 48 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i16 ret i16 %2 @@ -363,7 +365,6 @@ define zeroext i16 @ull2us(i64 %0) { define i32 @ull2i(i64 %0) { ; CHECK-LABEL: ull2i: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, 
%s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i32 ret i32 %2 @@ -372,7 +373,6 @@ define i32 @ull2i(i64 %0) { define i32 @ull2ui(i64 %0) { ; CHECK-LABEL: ull2ui: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i64 %0 to i32 ret i32 %2 @@ -433,8 +433,8 @@ define double @ull2d(i64 %x) { define signext i8 @i2c(i32 %0) { ; CHECK-LABEL: i2c: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i32 %0 to i8 ret i8 %2 @@ -452,8 +452,8 @@ define zeroext i8 @i2uc(i32 %0) { define signext i16 @i2s(i32 %0) { ; CHECK-LABEL: i2s: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i32 %0 to i16 ret i16 %2 @@ -503,6 +503,7 @@ define i64 @i2ull(i32 %0) { define float @i2f(i32 %x) { ; CHECK-LABEL: i2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sitofp i32 %x to float @@ -512,6 +513,7 @@ define float @i2f(i32 %x) { define double @i2d(i32 %x) { ; CHECK-LABEL: i2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sitofp i32 %x to double @@ -521,8 +523,8 @@ define double @i2d(i32 %x) { define signext i8 @ui2c(i32 %0) { ; CHECK-LABEL: ui2c: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i32 %0 to i8 ret i8 %2 @@ -540,8 +542,8 @@ define zeroext i8 @ui2uc(i32 %0) { define signext i16 @ui2s(i32 %0) { ; CHECK-LABEL: ui2s: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i32 %0 to i16 ret i16 %2 @@ -573,7 +575,7 @@ define i32 @ui2ui(i32 returned %0) { define i64 @ui2ll(i32 %0) { ; CHECK-LABEL: ui2ll: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i32 %0 to i64 ret i64 %2 @@ -582,7 +584,7 @@ define i64 @ui2ll(i32 %0) { define i64 @ui2ull(i32 %0) { ; CHECK-LABEL: ui2ull: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i32 %0 to i64 ret i64 %2 @@ -591,7 +593,7 @@ define i64 @ui2ull(i32 %0) { define float @ui2f(i32 %x) { ; CHECK-LABEL: ui2f: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: cvt.d.l %s0, %s0 ; CHECK-NEXT: cvt.s.d %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -602,7 +604,7 @@ define float @ui2f(i32 %x) { define double @ui2d(i32 %x) { ; CHECK-LABEL: ui2d: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: cvt.d.l %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = uitofp i32 %x to double @@ -612,8 +614,8 @@ define double @ui2d(i32 %x) { define signext i8 @s2c(i16 signext %0) { ; CHECK-LABEL: s2c: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: 
sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i16 %0 to i8 ret i8 %2 @@ -662,7 +664,6 @@ define i32 @s2ui(i16 signext %0) { define i64 @s2ll(i16 signext %0) { ; CHECK-LABEL: s2ll: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i16 %0 to i64 ret i64 %2 @@ -671,7 +672,6 @@ define i64 @s2ll(i16 signext %0) { define i64 @s2ull(i16 signext %0) { ; CHECK-LABEL: s2ull: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i16 %0 to i64 ret i64 %2 @@ -680,6 +680,7 @@ define i64 @s2ull(i16 signext %0) { define float @s2f(i16 signext %x) { ; CHECK-LABEL: s2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sitofp i16 %x to float @@ -689,6 +690,7 @@ define float @s2f(i16 signext %x) { define double @s2d(i16 signext %x) { ; CHECK-LABEL: s2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sitofp i16 %x to double @@ -698,8 +700,8 @@ define double @s2d(i16 signext %x) { define signext i8 @us2c(i16 zeroext %0) { ; CHECK-LABEL: us2c: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i16 %0 to i8 ret i8 %2 @@ -717,8 +719,8 @@ define zeroext i8 @us2uc(i16 zeroext %0) { define signext i16 @us2s(i16 returned zeroext %0) { ; CHECK-LABEL: us2s: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 ret i16 %0 } @@ -749,7 +751,6 @@ define i32 @us2ui(i16 zeroext %0) { define i64 @us2ll(i16 zeroext %0) { ; CHECK-LABEL: us2ll: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i16 %0 to i64 ret i64 %2 @@ -758,7 +759,6 @@ define i64 @us2ll(i16 zeroext %0) { define i64 @us2ull(i16 zeroext %0) { ; CHECK-LABEL: us2ull: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i16 %0 to i64 ret i64 %2 @@ -767,6 +767,7 @@ define i64 @us2ull(i16 zeroext %0) { define float @us2f(i16 zeroext %x) { ; CHECK-LABEL: us2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = uitofp i16 %x to float @@ -776,6 +777,7 @@ define float @us2f(i16 zeroext %x) { define double @us2d(i16 zeroext %x) { ; CHECK-LABEL: us2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = uitofp i16 %x to double @@ -833,7 +835,6 @@ define i32 @c2ui(i8 signext %0) { define i64 @c2ll(i8 signext %0) { ; CHECK-LABEL: c2ll: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i8 %0 to i64 ret i64 %2 @@ -842,7 +843,6 @@ define i64 @c2ll(i8 signext %0) { define i64 @c2ull(i8 signext %0) { ; CHECK-LABEL: c2ull: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i8 %0 to i64 ret i64 %2 @@ -851,6 +851,7 @@ define i64 @c2ull(i8 signext %0) { define float @c2f(i8 signext %x) { ; CHECK-LABEL: c2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or 
%s11, 0, %s9 %r = sitofp i8 %x to float @@ -860,6 +861,7 @@ define float @c2f(i8 signext %x) { define double @c2d(i8 signext %x) { ; CHECK-LABEL: c2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sitofp i8 %x to double @@ -869,8 +871,8 @@ define double @c2d(i8 signext %x) { define signext i8 @uc2c(i8 returned zeroext %0) { ; CHECK-LABEL: uc2c: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 ret i8 %0 } @@ -917,7 +919,6 @@ define i32 @uc2ui(i8 zeroext %0) { define i64 @uc2ll(i8 zeroext %0) { ; CHECK-LABEL: uc2ll: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i8 %0 to i64 ret i64 %2 @@ -926,7 +927,6 @@ define i64 @uc2ll(i8 zeroext %0) { define i64 @uc2ull(i8 zeroext %0) { ; CHECK-LABEL: uc2ull: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i8 %0 to i64 ret i64 %2 @@ -935,6 +935,7 @@ define i64 @uc2ull(i8 zeroext %0) { define float @uc2f(i8 zeroext %x) { ; CHECK-LABEL: uc2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = uitofp i8 %x to float @@ -944,6 +945,7 @@ define float @uc2f(i8 zeroext %x) { define double @uc2d(i8 zeroext %x) { ; CHECK-LABEL: uc2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = uitofp i8 %x to double @@ -976,7 +978,6 @@ define signext i8 @i1282c(i128 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 56 ; CHECK-NEXT: sra.l %s0, %s0, 56 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i8 ret i8 %2 @@ -988,7 +989,6 @@ define signext i8 @ui1282c(i128 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 56 ; CHECK-NEXT: sra.l %s0, %s0, 56 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i8 ret i8 %2 @@ -1020,7 +1020,6 @@ define signext i16 @i1282s(i128 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 48 ; CHECK-NEXT: sra.l %s0, %s0, 48 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i16 ret i16 %2 @@ -1032,7 +1031,6 @@ define signext i16 @ui1282s(i128 %0) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: sll %s0, %s0, 48 ; CHECK-NEXT: sra.l %s0, %s0, 48 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i16 ret i16 %2 @@ -1062,7 +1060,6 @@ define zeroext i16 @ui1282us(i128 %0) { define i32 @i1282i(i128 %0) { ; CHECK-LABEL: i1282i: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i32 ret i32 %2 @@ -1072,7 +1069,6 @@ define i32 @i1282i(i128 %0) { define i32 @ui1282i(i128 %0) { ; CHECK-LABEL: ui1282i: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i32 ret i32 %2 @@ -1082,7 +1078,6 @@ define i32 @ui1282i(i128 %0) { define i32 @i1282ui(i128 %0) { ; CHECK-LABEL: i1282ui: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i32 ret i32 %2 @@ -1092,7 +1087,6 @@ define i32 @i1282ui(i128 %0) { define i32 
@ui1282ui(i128 %0) { ; CHECK-LABEL: ui1282ui: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = trunc i128 %0 to i32 ret i32 %2 @@ -1216,7 +1210,7 @@ define i128 @i2ui128(i32 %0) { define i128 @ui2i128(i32 %0) { ; CHECK-LABEL: ui2i128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i32 %0 to i128 @@ -1227,7 +1221,7 @@ define i128 @ui2i128(i32 %0) { define i128 @ui2ui128(i32 %0) { ; CHECK-LABEL: ui2ui128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i32 %0 to i128 @@ -1238,7 +1232,6 @@ define i128 @ui2ui128(i32 %0) { define i128 @s2i128(i16 signext %0) { ; CHECK-LABEL: s2i128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.l %s1, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i16 %0 to i128 @@ -1249,7 +1242,6 @@ define i128 @s2i128(i16 signext %0) { define i128 @s2ui128(i16 signext %0) { ; CHECK-LABEL: s2ui128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.l %s1, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i16 %0 to i128 @@ -1260,7 +1252,6 @@ define i128 @s2ui128(i16 signext %0) { define i128 @us2i128(i16 zeroext %0) { ; CHECK-LABEL: us2i128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i16 %0 to i128 @@ -1271,7 +1262,6 @@ define i128 @us2i128(i16 zeroext %0) { define i128 @us2ui128(i16 zeroext %0) { ; CHECK-LABEL: us2ui128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i16 %0 to i128 @@ -1282,7 +1272,6 @@ define i128 @us2ui128(i16 zeroext %0) { define i128 @c2i128(i8 signext %0) { ; CHECK-LABEL: c2i128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.l %s1, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i8 %0 to i128 @@ -1293,7 +1282,6 @@ define i128 @c2i128(i8 signext %0) { define i128 @char2ui128(i8 signext %0) { ; CHECK-LABEL: char2ui128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.l %s1, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %2 = sext i8 %0 to i128 @@ -1304,7 +1292,6 @@ define i128 @char2ui128(i8 signext %0) { define i128 @uc2i128(i8 zeroext %0) { ; CHECK-LABEL: uc2i128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i8 %0 to i128 @@ -1315,7 +1302,6 @@ define i128 @uc2i128(i8 zeroext %0) { define i128 @uc2ui128(i8 zeroext %0) { ; CHECK-LABEL: uc2ui128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = zext i8 %0 to i128 diff --git a/llvm/test/CodeGen/VE/constants.ll b/llvm/test/CodeGen/VE/constants.ll index b7a43605ae347..baebf5ef3621e 100644 --- a/llvm/test/CodeGen/VE/constants.ll +++ b/llvm/test/CodeGen/VE/constants.ll @@ -304,7 +304,6 @@ define float @m5f32() { ; CHECK-LABEL: m5f32: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s0, -1063256064 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 ret float -5.000000e+00 } @@ -321,7 +320,6 @@ define float @p2p3f32() { ; CHECK-LABEL: p2p3f32: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s0, 
1075000115 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 ret float 0x4002666660000000 ; 2.3 } @@ -339,7 +337,6 @@ define float @p128p3f32() { ; CHECK-LABEL: p128p3f32: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s0, 1124093133 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 ret float 0x40600999A0000000 ; 128.3 } diff --git a/llvm/test/CodeGen/VE/ctlz.ll b/llvm/test/CodeGen/VE/ctlz.ll index de44790014a07..5853851ac9c97 100644 --- a/llvm/test/CodeGen/VE/ctlz.ll +++ b/llvm/test/CodeGen/VE/ctlz.ll @@ -14,10 +14,9 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i32 @func2(i32 %p) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sll %s0, %s0, 32 ; CHECK-NEXT: ldz %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 true) ret i32 %r diff --git a/llvm/test/CodeGen/VE/ctpop.ll b/llvm/test/CodeGen/VE/ctpop.ll index 3d25909ab25cb..8fee9104ed077 100644 --- a/llvm/test/CodeGen/VE/ctpop.ll +++ b/llvm/test/CodeGen/VE/ctpop.ll @@ -14,10 +14,9 @@ declare i64 @llvm.ctpop.i64(i64 %p) define i32 @func2(i32 %p) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.ctpop.i32(i32 %p) ret i32 %r @@ -29,9 +28,7 @@ define i16 @func3(i16 %p) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: and %s0, %s0, (48)0 -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.ctpop.i16(i16 %p) ret i16 %r @@ -43,9 +40,7 @@ define i8 @func4(i8 %p) { ; CHECK-LABEL: func4: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: and %s0, %s0, (56)0 -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i8 @llvm.ctpop.i8(i8 %p) ret i8 %r diff --git a/llvm/test/CodeGen/VE/cttz.ll b/llvm/test/CodeGen/VE/cttz.ll index 4b79a0f988e86..46bb52d29102b 100644 --- a/llvm/test/CodeGen/VE/cttz.ll +++ b/llvm/test/CodeGen/VE/cttz.ll @@ -16,12 +16,12 @@ declare i64 @llvm.cttz.i64(i64, i1) define i32 @func2(i32 %p) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i32 @llvm.cttz.i32(i32 %p, i1 true) ret i32 %r @@ -32,12 +32,12 @@ declare i32 @llvm.cttz.i32(i32, i1) define i16 @func3(i16 %p) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i16 @llvm.cttz.i16(i16 %p, i1 true) ret i16 %r @@ -48,12 +48,12 @@ declare i16 @llvm.cttz.i16(i16, i1) define i8 @func4(i8 %p) { ; CHECK-LABEL: func4: ; CHECK: 
.LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s1, -1, %s0 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: pcnt %s0, %s0 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %r = tail call i8 @llvm.cttz.i8(i8 %p, i1 true) ret i8 %r diff --git a/llvm/test/CodeGen/VE/div.ll b/llvm/test/CodeGen/VE/div.ll index 8d4a0ddd2c156..ed434a9a3c7b5 100644 --- a/llvm/test/CodeGen/VE/div.ll +++ b/llvm/test/CodeGen/VE/div.ll @@ -14,6 +14,8 @@ define i64 @divi64(i64 %a, i64 %b) { define i32 @divi32(i32 %a, i32 %b) { ; CHECK-LABEL: divi32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %r = sdiv i32 %a, %b @@ -34,6 +36,8 @@ define i64 @divu64(i64 %a, i64 %b) { define i32 @divu32(i32 %a, i32 %b) { ; CHECK-LABEL: divu32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %r = udiv i32 %a, %b @@ -44,9 +48,11 @@ define i32 @divu32(i32 %a, i32 %b) { define signext i16 @divi16(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: divi16: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %a32 = sext i16 %a to i32 %b32 = sext i16 %b to i32 @@ -59,7 +65,10 @@ define signext i16 @divi16(i16 signext %a, i16 signext %b) { define zeroext i16 @divu16(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: divu16: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = udiv i16 %a, %b ret i16 %r @@ -69,9 +78,11 @@ define zeroext i16 @divu16(i16 zeroext %a, i16 zeroext %b) { define signext i8 @divi8(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: divi8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %a32 = sext i8 %a to i32 %b32 = sext i8 %b to i32 @@ -84,7 +95,10 @@ define signext i8 @divi8(i8 signext %a, i8 signext %b) { define zeroext i8 @divu8(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: divu8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = udiv i8 %a, %b ret i8 %r @@ -104,6 +118,7 @@ define i64 @divi64ri(i64 %a, i64 %b) { define i32 @divi32ri(i32 %a, i32 %b) { ; CHECK-LABEL: divi32ri: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s0, %s0, (62)0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sdiv i32 %a, 3 @@ -124,6 +139,7 @@ define i64 @divu64ri(i64 %a, i64 %b) { define i32 @divu32ri(i32 %a, i32 %b) { ; CHECK-LABEL: divu32ri: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s0, %s0, (62)0 ; CHECK-NEXT: or %s11, 0, %s9 %r = udiv i32 %a, 
3 @@ -144,7 +160,8 @@ define i64 @divi64li(i64 %a, i64 %b) { define i32 @divi32li(i32 %a, i32 %b) { ; CHECK-LABEL: divi32li: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: divs.w.sx %s0, 3, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: divs.w.sx %s0, 3, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = sdiv i32 3, %b ret i32 %r @@ -164,7 +181,8 @@ define i64 @divu64li(i64 %a, i64 %b) { define i32 @divu32li(i32 %a, i32 %b) { ; CHECK-LABEL: divu32li: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: divu.w %s0, 3, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: divu.w %s0, 3, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = udiv i32 3, %b ret i32 %r diff --git a/llvm/test/CodeGen/VE/fp_to_int.ll b/llvm/test/CodeGen/VE/fp_to_int.ll index 9a1a7e35c1190..d9b1926ea9d5d 100644 --- a/llvm/test/CodeGen/VE/fp_to_int.ll +++ b/llvm/test/CodeGen/VE/fp_to_int.ll @@ -5,6 +5,7 @@ define signext i8 @f2c(float %a) { ; CHECK-LABEL: f2c: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptosi float %a to i8 @@ -16,6 +17,7 @@ define signext i16 @f2s(float %a) { ; CHECK-LABEL: f2s: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptosi float %a to i16 @@ -50,6 +52,7 @@ define zeroext i8 @f2uc(float %a) { ; CHECK-LABEL: f2uc: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui float %a to i8 @@ -61,6 +64,7 @@ define zeroext i16 @f2us(float %a) { ; CHECK-LABEL: f2us: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.s.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui float %a to i16 @@ -73,7 +77,6 @@ define i32 @f2ui(float %a) { ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: cvt.l.d.rz %s0, %s0 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui float %a to i32 @@ -105,6 +108,7 @@ define signext i8 @d2c(double %a) { ; CHECK-LABEL: d2c: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptosi double %a to i8 @@ -116,6 +120,7 @@ define signext i16 @d2s(double %a) { ; CHECK-LABEL: d2s: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptosi double %a to i16 @@ -149,6 +154,7 @@ define zeroext i8 @d2uc(double %a) { ; CHECK-LABEL: d2uc: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui double %a to i8 @@ -160,6 +166,7 @@ define zeroext i16 @d2us(double %a) { ; CHECK-LABEL: d2us: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.w.d.sx.rz %s0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui double %a to i16 @@ -171,7 +178,6 @@ define i32 @d2ui(double %a) { ; CHECK-LABEL: d2ui: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: cvt.l.d.rz %s0, %s0 -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 entry: %conv = fptoui double %a to i32 diff --git a/llvm/test/CodeGen/VE/int_to_fp.ll b/llvm/test/CodeGen/VE/int_to_fp.ll index 5069a0ca0d7d5..2e850142e2e9f 100644 --- a/llvm/test/CodeGen/VE/int_to_fp.ll +++ b/llvm/test/CodeGen/VE/int_to_fp.ll @@ -4,6 +4,7 @@ define float 
@c2f(i8 signext %a) { ; CHECK-LABEL: c2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -15,6 +16,7 @@ entry: define float @s2f(i16 signext %a) { ; CHECK-LABEL: s2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -26,6 +28,7 @@ entry: define float @i2f(i32 %a) { ; CHECK-LABEL: i2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -49,6 +52,7 @@ entry: define float @uc2f(i8 zeroext %a) { ; CHECK-LABEL: uc2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -60,6 +64,7 @@ entry: define float @us2f(i16 zeroext %a) { ; CHECK-LABEL: us2f: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.s.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -71,7 +76,7 @@ entry: define float @ui2f(i32 %a) { ; CHECK-LABEL: ui2f: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: cvt.d.l %s0, %s0 ; CHECK-NEXT: cvt.s.d %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -106,6 +111,7 @@ entry: define double @c2d(i8 signext %a) { ; CHECK-LABEL: c2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -117,6 +123,7 @@ entry: define double @s2d(i16 signext %a) { ; CHECK-LABEL: s2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -128,6 +135,7 @@ entry: define double @i2d(i32 %a) { ; CHECK-LABEL: i2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -150,6 +158,7 @@ entry: define double @uc2d(i8 zeroext %a) { ; CHECK-LABEL: uc2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -161,6 +170,7 @@ entry: define double @us2d(i16 zeroext %a) { ; CHECK-LABEL: us2d: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cvt.d.w %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: @@ -172,7 +182,7 @@ entry: define double @ui2d(i32 %a) { ; CHECK-LABEL: ui2d: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: cvt.d.l %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 entry: diff --git a/llvm/test/CodeGen/VE/left_shift.ll b/llvm/test/CodeGen/VE/left_shift.ll index d568846974dd9..fa595a916e529 100644 --- a/llvm/test/CodeGen/VE/left_shift.ll +++ b/llvm/test/CodeGen/VE/left_shift.ll @@ -3,9 +3,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sext i8 %0 to i32 %4 = sext i8 %1 to i32 @@ -17,9 +19,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { define signext i16 @func2(i16 signext %0, i16 signext %1) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: 
adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sext i16 %0 to i32 %4 = sext i16 %1 to i32 @@ -31,6 +35,8 @@ define signext i16 @func2(i16 signext %0, i16 signext %1) { define i32 @func3(i32 %0, i32 %1) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = shl i32 %0, %1 @@ -50,6 +56,8 @@ define i64 @func4(i64 %0, i64 %1) { define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -63,6 +71,8 @@ define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { define zeroext i16 @func7(i16 zeroext %0, i16 zeroext %1) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -76,6 +86,8 @@ define zeroext i16 @func7(i16 zeroext %0, i16 zeroext %1) { define i32 @func8(i32 %0, i32 %1) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = shl i32 %0, %1 @@ -95,9 +107,10 @@ define i64 @func9(i64 %0, i64 %1) { define signext i8 @func11(i8 signext %0) { ; CHECK-LABEL: func11: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i8 %0, 5 ret i8 %2 @@ -106,9 +119,10 @@ define signext i8 @func11(i8 signext %0) { define signext i16 @func12(i16 signext %0) { ; CHECK-LABEL: func12: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i16 %0, 5 ret i16 %2 @@ -117,6 +131,7 @@ define signext i16 @func12(i16 signext %0) { define i32 @func13(i32 %0) { ; CHECK-LABEL: func13: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i32 %0, 5 @@ -135,8 +150,10 @@ define i64 @func14(i64 %0) { define zeroext i8 @func16(i8 zeroext %0) { ; CHECK-LABEL: func16: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 -; CHECK-NEXT: and %s0, %s0, (56)0 +; CHECK-NEXT: lea %s1, 224 +; CHECK-NEXT: and %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i8 %0, 5 ret i8 %2 @@ -145,8 +162,10 @@ define zeroext i8 @func16(i8 zeroext %0) { define zeroext i16 @func17(i16 zeroext %0) { ; CHECK-LABEL: func17: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 -; CHECK-NEXT: and %s0, %s0, (48)0 +; CHECK-NEXT: lea %s1, 65504 +; CHECK-NEXT: and %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i16 %0, 5 ret i16 %2 @@ -155,6 +174,7 @@ 
define zeroext i16 @func17(i16 zeroext %0) { define i32 @func18(i32 %0) { ; CHECK-LABEL: func18: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 5 ; CHECK-NEXT: or %s11, 0, %s9 %2 = shl i32 %0, 5 diff --git a/llvm/test/CodeGen/VE/load_off.ll b/llvm/test/CodeGen/VE/load_off.ll index cc3da7a3a1cd5..2ebd616e533cc 100644 --- a/llvm/test/CodeGen/VE/load_off.ll +++ b/llvm/test/CodeGen/VE/load_off.ll @@ -118,7 +118,7 @@ define zeroext i32 @loadi32z() { ; CHECK-NEXT: lea %s0, bufi32+8@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s0, bufi32+8@hi(, %s0) -; CHECK-NEXT: ldl.sx %s0, (, %s0) +; CHECK-NEXT: ldl.zx %s0, (, %s0) ; CHECK-NEXT: or %s11, 0, %s9 entry: %0 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @bufi32, i64 0, i64 2), align 4 diff --git a/llvm/test/CodeGen/VE/max.ll b/llvm/test/CodeGen/VE/max.ll index 67f68b02b4510..2c342faa1f052 100644 --- a/llvm/test/CodeGen/VE/max.ll +++ b/llvm/test/CodeGen/VE/max.ll @@ -69,8 +69,6 @@ define float @max2f32(float, float) { define float @maxuf32(float, float) { ; CHECK-LABEL: maxuf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf1 killed $sf1 def $sx1 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 def $sx0 ; CHECK-NEXT: fcmp.s %s2, %s0, %s1 ; CHECK-NEXT: cmov.s.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 @@ -83,8 +81,6 @@ define float @maxuf32(float, float) { define float @max2uf32(float, float) { ; CHECK-LABEL: max2uf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf1 killed $sf1 def $sx1 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 def $sx0 ; CHECK-NEXT: fcmp.s %s2, %s0, %s1 ; CHECK-NEXT: cmov.s.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 @@ -141,6 +137,8 @@ define i64 @max2u64(i64, i64) { define i32 @maxi32(i32, i32) { ; CHECK-LABEL: maxi32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sgt i32 %0, %1 @@ -151,6 +149,8 @@ define i32 @maxi32(i32, i32) { define i32 @max2i32(i32, i32) { ; CHECK-LABEL: max2i32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sge i32 %0, %1 @@ -161,11 +161,10 @@ define i32 @max2i32(i32, i32) { define i32 @maxu32(i32, i32) { ; CHECK-LABEL: maxu32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 -; CHECK-NEXT: cmpu.w %s2, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 -; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: adds.w.sx %s2, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s2, %s0 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ugt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 @@ -175,11 +174,10 @@ define i32 @maxu32(i32, i32) { define i32 @max2u32(i32, i32) { ; CHECK-LABEL: max2u32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 -; CHECK-NEXT: cmpu.w %s2, %s0, %s1 -; CHECK-NEXT: cmov.w.ge %s1, %s0, %s2 -; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: adds.w.sx %s2, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s2, %s0 +; CHECK-NEXT: cmov.w.ge %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp uge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 diff --git a/llvm/test/CodeGen/VE/min.ll 
b/llvm/test/CodeGen/VE/min.ll index 3e28f757d9e98..dd6ad8460c806 100644 --- a/llvm/test/CodeGen/VE/min.ll +++ b/llvm/test/CodeGen/VE/min.ll @@ -67,8 +67,6 @@ define float @min2f32(float, float) { define float @minuf32(float, float) { ; CHECK-LABEL: minuf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf1 killed $sf1 def $sx1 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 def $sx0 ; CHECK-NEXT: fcmp.s %s2, %s0, %s1 ; CHECK-NEXT: cmov.s.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 @@ -81,8 +79,6 @@ define float @minuf32(float, float) { define float @min2uf32(float, float) { ; CHECK-LABEL: min2uf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf1 killed $sf1 def $sx1 -; CHECK-NEXT: # kill: def $sf0 killed $sf0 def $sx0 ; CHECK-NEXT: fcmp.s %s2, %s0, %s1 ; CHECK-NEXT: cmov.s.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 @@ -139,6 +135,8 @@ define i64 @min2u64(i64, i64) { define i32 @mini32(i32, i32) { ; CHECK-LABEL: mini32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp slt i32 %0, %1 @@ -149,6 +147,8 @@ define i32 @mini32(i32, i32) { define i32 @min2i32(i32, i32) { ; CHECK-LABEL: min2i32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sle i32 %0, %1 @@ -159,11 +159,10 @@ define i32 @min2i32(i32, i32) { define i32 @minu32(i32, i32) { ; CHECK-LABEL: minu32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 -; CHECK-NEXT: cmpu.w %s2, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 -; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: adds.w.sx %s2, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s2, %s0 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ult i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 @@ -173,11 +172,10 @@ define i32 @minu32(i32, i32) { define i32 @min2u32(i32, i32) { ; CHECK-LABEL: min2u32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 -; CHECK-NEXT: cmpu.w %s2, %s0, %s1 -; CHECK-NEXT: cmov.w.le %s1, %s0, %s2 -; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: adds.w.sx %s2, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s2, %s0 +; CHECK-NEXT: cmov.w.le %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ule i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 @@ -187,10 +185,11 @@ define i32 @min2u32(i32, i32) { define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-LABEL: mini1: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: and %s2, %s1, %s0 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 -; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = xor i1 %0, true %4 = and i1 %3, %1 diff --git a/llvm/test/CodeGen/VE/multiply.ll b/llvm/test/CodeGen/VE/multiply.ll index dabb6cf85d12f..83b7a67ff453b 100644 --- a/llvm/test/CodeGen/VE/multiply.ll +++ b/llvm/test/CodeGen/VE/multiply.ll @@ -3,9 +3,11 @@ define signext i8 @func1(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; 
CHECK-NEXT: muls.w.sx %s0, %s1, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i8 %b, %a ret i8 %r @@ -14,9 +16,11 @@ define signext i8 @func1(i8 signext %a, i8 signext %b) { define signext i16 @func2(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i16 %b, %a ret i16 %r @@ -25,6 +29,8 @@ define signext i16 @func2(i16 signext %a, i16 signext %b) { define i32 @func3(i32 %a, i32 %b) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul nsw i32 %b, %a @@ -43,6 +49,8 @@ define i64 @func4(i64 %a, i64 %b) { define zeroext i8 @func5(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: func5: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -53,6 +61,8 @@ define zeroext i8 @func5(i8 zeroext %a, i8 zeroext %b) { define zeroext i16 @func6(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -63,6 +73,8 @@ define zeroext i16 @func6(i16 zeroext %a, i16 zeroext %b) { define i32 @func7(i32 %a, i32 %b) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i32 %b, %a @@ -81,9 +93,10 @@ define i64 @func8(i64 %a, i64 %b) { define signext i8 @func9(i8 signext %a) { ; CHECK-LABEL: func9: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i8 %a, 5 ret i8 %r @@ -92,9 +105,10 @@ define signext i8 @func9(i8 signext %a) { define signext i16 @func10(i16 signext %a) { ; CHECK-LABEL: func10: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i16 %a, 5 ret i16 %r @@ -103,6 +117,7 @@ define signext i16 @func10(i16 signext %a) { define i32 @func11(i32 %a) { ; CHECK-LABEL: func11: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul nsw i32 %a, 5 @@ -121,6 +136,7 @@ define i64 @func12(i64 %a) { define zeroext i8 @func13(i8 zeroext %a) { ; CHECK-LABEL: func13: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -131,6 +147,7 @@ define 
zeroext i8 @func13(i8 zeroext %a) { define zeroext i16 @func14(i16 zeroext %a) { ; CHECK-LABEL: func14: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -141,6 +158,7 @@ define zeroext i16 @func14(i16 zeroext %a) { define i32 @func15(i32 %a) { ; CHECK-LABEL: func15: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: muls.w.sx %s0, 5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = mul i32 %a, 5 @@ -159,6 +177,7 @@ define i64 @func16(i64 %a) { define i32 @func17(i32 %a) { ; CHECK-LABEL: func17: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s0, %s0, 31 ; CHECK-NEXT: or %s11, 0, %s9 %r = shl i32 %a, 31 diff --git a/llvm/test/CodeGen/VE/nnd.ll b/llvm/test/CodeGen/VE/nnd.ll index aea10d4834cdd..aedb85050f301 100644 --- a/llvm/test/CodeGen/VE/nnd.ll +++ b/llvm/test/CodeGen/VE/nnd.ll @@ -3,8 +3,11 @@ define signext i8 @func8s(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: func8s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i8 %a, -1 %res = and i8 %not, %b @@ -14,8 +17,11 @@ define signext i8 @func8s(i8 signext %a, i8 signext %b) { define zeroext i8 @func8z(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: func8z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s1, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i8 %a, -1 %res = and i8 %b, %not @@ -25,8 +31,10 @@ define zeroext i8 @func8z(i8 zeroext %a, i8 zeroext %b) { define signext i8 @funci8s(i8 signext %a) { ; CHECK-LABEL: funci8s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, 5, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i8 %a, -1 %res = and i8 %not, 5 @@ -36,6 +44,7 @@ define signext i8 @funci8s(i8 signext %a) { define zeroext i8 @funci8z(i8 zeroext %a) { ; CHECK-LABEL: funci8z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: lea %s1, 251 ; CHECK-NEXT: and %s0, %s0, %s1 @@ -48,8 +57,11 @@ define zeroext i8 @funci8z(i8 zeroext %a) { define signext i16 @func16s(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: func16s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i16 %a, -1 %res = and i16 %not, %b @@ -59,8 +71,11 @@ define signext i16 @func16s(i16 signext %a, i16 signext %b) { define zeroext i16 @func16z(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: func16z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s1, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i16 %a, -1 %res = and i16 %b, %not @@ -70,7 +85,9 @@ define zeroext i16 @func16z(i16 zeroext %a, i16 zeroext %b) { define signext i16 @funci16s(i16 signext %a) { ; CHECK-LABEL: funci16s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: 
adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i16 %a, -1 %res = and i16 %not, 65535 @@ -80,8 +97,10 @@ define signext i16 @funci16s(i16 signext %a) { define zeroext i16 @funci16z(i16 zeroext %a) { ; CHECK-LABEL: funci16z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, (52)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i16 %a, -1 %res = and i16 4095, %not @@ -91,8 +110,11 @@ define zeroext i16 @funci16z(i16 zeroext %a) { define signext i32 @func32s(i32 signext %a, i32 signext %b) { ; CHECK-LABEL: func32s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i32 %a, -1 %res = and i32 %not, %b @@ -102,8 +124,11 @@ define signext i32 @func32s(i32 signext %a, i32 signext %b) { define zeroext i32 @func32z(i32 zeroext %a, i32 zeroext %b) { ; CHECK-LABEL: func32z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i32 %a, -1 %res = and i32 %not, %b @@ -113,8 +138,10 @@ define zeroext i32 @func32z(i32 zeroext %a, i32 zeroext %b) { define signext i32 @funci32s(i32 signext %a) { ; CHECK-LABEL: funci32s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, (36)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i32 %a, -1 %res = and i32 %not, 268435455 @@ -124,8 +151,10 @@ define signext i32 @funci32s(i32 signext %a) { define zeroext i32 @funci32z(i32 zeroext %a) { ; CHECK-LABEL: funci32z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 ; CHECK-NEXT: and %s0, %s0, (36)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %not = xor i32 %a, -1 %res = and i32 %not, 268435455 diff --git a/llvm/test/CodeGen/VE/or.ll b/llvm/test/CodeGen/VE/or.ll index 1f8c35012f811..8ddb1b5fbf80a 100644 --- a/llvm/test/CodeGen/VE/or.ll +++ b/llvm/test/CodeGen/VE/or.ll @@ -21,7 +21,9 @@ define zeroext i8 @func8z(i8 zeroext %a, i8 zeroext %b) { define signext i8 @funci8s(i8 signext %a) { ; CHECK-LABEL: funci8s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s0, 5, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = or i8 %a, 5 ret i8 %res @@ -30,8 +32,10 @@ define signext i8 @funci8s(i8 signext %a) { define zeroext i8 @funci8z(i8 zeroext %a) { ; CHECK-LABEL: funci8z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: lea %s1, 251 ; CHECK-NEXT: or %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = or i8 -5, %a ret i8 %res @@ -67,7 +71,9 @@ define signext i16 @funci16s(i16 signext %a) { define zeroext i16 @funci16z(i16 zeroext %a) { ; CHECK-LABEL: funci16z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s0, %s0, (52)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = or i16 4095, %a ret i16 %res @@ -94,7 +100,9 @@ define zeroext i32 @func32z(i32 zeroext %a, i32 
zeroext %b) { define signext i32 @funci32s(i32 signext %a) { ; CHECK-LABEL: funci32s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s0, %s0, (36)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = or i32 %a, 268435455 ret i32 %res @@ -103,7 +111,9 @@ define signext i32 @funci32s(i32 signext %a) { define zeroext i32 @funci32z(i32 zeroext %a) { ; CHECK-LABEL: funci32z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s0, %s0, (36)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = or i32 %a, 268435455 ret i32 %res diff --git a/llvm/test/CodeGen/VE/pic_access_static_data.ll b/llvm/test/CodeGen/VE/pic_access_static_data.ll index e0741724172e9..892aa8465d99f 100644 --- a/llvm/test/CodeGen/VE/pic_access_static_data.ll +++ b/llvm/test/CodeGen/VE/pic_access_static_data.ll @@ -52,7 +52,7 @@ define i32 @main() { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s0, dst@gotoff_hi(, %s0) ; CHECK-NEXT: ldl.sx %s1, (%s0, %s15) -; CHECK-NEXT: stl %s1, 184(, %s11) +; CHECK-NEXT: st %s1, 184(, %s11) ; CHECK-NEXT: lea %s0, .L.str@gotoff_lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s0, .L.str@gotoff_hi(, %s0) diff --git a/llvm/test/CodeGen/VE/rem.ll b/llvm/test/CodeGen/VE/rem.ll index 9fa558f5ca3cd..52ac3c3a3c9e7 100644 --- a/llvm/test/CodeGen/VE/rem.ll +++ b/llvm/test/CodeGen/VE/rem.ll @@ -16,6 +16,8 @@ define i64 @remi64(i64 %a, i64 %b) { define i32 @remi32(i32 %a, i32 %b) { ; CHECK-LABEL: remi32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 @@ -40,6 +42,8 @@ define i64 @remu64(i64 %a, i64 %b) { define i32 @remu32(i32 %a, i32 %b) { ; CHECK-LABEL: remu32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 @@ -52,11 +56,13 @@ define i32 @remu32(i32 %a, i32 %b) { define signext i16 @remi16(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: remi16: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %a32 = sext i16 %a to i32 %b32 = sext i16 %b to i32 @@ -69,9 +75,12 @@ define signext i16 @remi16(i16 signext %a, i16 signext %b) { define zeroext i16 @remu16(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: remu16: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = urem i16 %a, %b ret i16 %r @@ -81,11 +90,13 @@ define zeroext i16 @remu16(i16 zeroext %a, i16 zeroext %b) { define signext i8 @remi8(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: remi8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, 
%s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %a32 = sext i8 %a to i32 %b32 = sext i8 %b to i32 @@ -98,9 +109,12 @@ define signext i8 @remi8(i8 signext %a, i8 signext %b) { define zeroext i8 @remu8(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: remu8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s2, %s0, %s1 ; CHECK-NEXT: muls.w.sx %s1, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %r = urem i8 %a, %b ret i8 %r @@ -122,6 +136,7 @@ define i64 @remi64ri(i64 %a, i64 %b) { define i32 @remi32ri(i32 %a, i32 %b) { ; CHECK-LABEL: remi32ri: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divs.w.sx %s1, %s0, (62)0 ; CHECK-NEXT: muls.w.sx %s1, 3, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 @@ -146,6 +161,7 @@ define i64 @remu64ri(i64 %a, i64 %b) { define i32 @remu32ri(i32 %a, i32 %b) { ; CHECK-LABEL: remu32ri: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: divu.w %s1, %s0, (62)0 ; CHECK-NEXT: muls.w.sx %s1, 3, %s1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 @@ -170,8 +186,9 @@ define i64 @remi64li(i64 %a, i64 %b) { define i32 @remi32li(i32 %a, i32 %b) { ; CHECK-LABEL: remi32li: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: divs.w.sx %s0, 3, %s1 -; CHECK-NEXT: muls.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: divs.w.sx %s1, 3, %s0 +; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: subs.w.sx %s0, 3, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = srem i32 3, %b @@ -194,8 +211,9 @@ define i64 @remu64li(i64 %a, i64 %b) { define i32 @remu32li(i32 %a, i32 %b) { ; CHECK-LABEL: remu32li: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: divu.w %s0, 3, %s1 -; CHECK-NEXT: muls.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: divu.w %s1, 3, %s0 +; CHECK-NEXT: muls.w.sx %s0, %s1, %s0 ; CHECK-NEXT: subs.w.sx %s0, 3, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %r = urem i32 3, %b diff --git a/llvm/test/CodeGen/VE/right_shift.ll b/llvm/test/CodeGen/VE/right_shift.ll index faad722a30531..87ac6df7e62ed 100644 --- a/llvm/test/CodeGen/VE/right_shift.ll +++ b/llvm/test/CodeGen/VE/right_shift.ll @@ -3,7 +3,10 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sext i8 %0 to i32 %4 = sext i8 %1 to i32 @@ -15,7 +18,10 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { define signext i16 @func2(i16 signext %0, i16 signext %1) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sext i16 %0 to i32 %4 = sext i16 %1 to i32 @@ -27,6 +33,8 @@ define signext i16 @func2(i16 signext %0, i16 signext %1) { define i32 @func3(i32 %0, i32 %1) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = ashr i32 %0, %1 @@ -46,10 +54,11 @@ define i64 @func4(i64 %0, i64 %1) { define zeroext i8 
@func7(i8 zeroext %0, i8 zeroext %1) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = zext i8 %0 to i32 %4 = zext i8 %1 to i32 @@ -61,10 +70,11 @@ define zeroext i8 @func7(i8 zeroext %0, i8 zeroext %1) { define zeroext i16 @func8(i16 zeroext %0, i16 zeroext %1) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = zext i16 %0 to i32 %4 = zext i16 %1 to i32 @@ -76,10 +86,10 @@ define zeroext i16 @func8(i16 zeroext %0, i16 zeroext %1) { define i32 @func9(i32 %0, i32 %1) { ; CHECK-LABEL: func9: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %3 = lshr i32 %0, %1 ret i32 %3 @@ -98,7 +108,9 @@ define i64 @func10(i64 %0, i64 %1) { define signext i8 @func12(i8 signext %0) { ; CHECK-LABEL: func12: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, 5 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = ashr i8 %0, 5 ret i8 %2 @@ -107,7 +119,9 @@ define signext i8 @func12(i8 signext %0) { define signext i16 @func13(i16 signext %0) { ; CHECK-LABEL: func13: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, 5 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = ashr i16 %0, 5 ret i16 %2 @@ -116,6 +130,7 @@ define signext i16 @func13(i16 signext %0) { define i32 @func14(i32 %0) { ; CHECK-LABEL: func14: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sra.w.sx %s0, %s0, 5 ; CHECK-NEXT: or %s11, 0, %s9 %2 = ashr i32 %0, 5 @@ -134,10 +149,10 @@ define i64 @func15(i64 %0) { define zeroext i8 @func17(i8 zeroext %0) { ; CHECK-LABEL: func17: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, 5 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = lshr i8 %0, 5 ret i8 %2 @@ -146,10 +161,10 @@ define zeroext i8 @func17(i8 zeroext %0) { define zeroext i16 @func18(i16 zeroext %0) { ; CHECK-LABEL: func18: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, 5 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = lshr i16 %0, 5 ret i16 %2 @@ -158,10 +173,9 @@ define zeroext i16 @func18(i16 zeroext %0) { define i32 @func19(i32 %0) { ; CHECK-LABEL: func19: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 
+; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: srl %s0, %s0, 5 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 ; CHECK-NEXT: or %s11, 0, %s9 %2 = lshr i32 %0, 5 ret i32 %2 diff --git a/llvm/test/CodeGen/VE/rotl.ll b/llvm/test/CodeGen/VE/rotl.ll index e7c498f1d34d2..cc5e004478ab5 100644 --- a/llvm/test/CodeGen/VE/rotl.ll +++ b/llvm/test/CodeGen/VE/rotl.ll @@ -3,6 +3,7 @@ define i64 @func1(i64 %a, i32 %b) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: sll %s2, %s0, %s1 ; CHECK-NEXT: lea %s3, 64 ; CHECK-NEXT: subs.w.sx %s1, %s3, %s1 @@ -21,7 +22,8 @@ define i64 @func1(i64 %a, i32 %b) { define i32 @func2(i32 %a, i32 %b) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: sla.w.sx %s2, %s0, %s1 ; CHECK-NEXT: subs.w.sx %s1, 32, %s1 ; CHECK-NEXT: and %s0, %s0, (32)0 diff --git a/llvm/test/CodeGen/VE/rotr.ll b/llvm/test/CodeGen/VE/rotr.ll index 40734a3d5178a..93dcbbc7e0a82 100644 --- a/llvm/test/CodeGen/VE/rotr.ll +++ b/llvm/test/CodeGen/VE/rotr.ll @@ -3,6 +3,7 @@ define i64 @func1(i64 %a, i32 %b) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 ; CHECK-NEXT: srl %s2, %s0, %s1 ; CHECK-NEXT: lea %s3, 64 ; CHECK-NEXT: subs.w.sx %s1, %s3, %s1 @@ -21,7 +22,8 @@ define i64 @func1(i64 %a, i32 %b) { define i32 @func2(i32 %a, i32 %b) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw0 killed $sw0 def $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: and %s2, %s0, (32)0 ; CHECK-NEXT: srl %s2, %s2, %s1 ; CHECK-NEXT: subs.w.sx %s1, 32, %s1 diff --git a/llvm/test/CodeGen/VE/select.ll b/llvm/test/CodeGen/VE/select.ll index 6efe073381d51..81234d3d955cc 100644 --- a/llvm/test/CodeGen/VE/select.ll +++ b/llvm/test/CodeGen/VE/select.ll @@ -3,6 +3,7 @@ define double @selectf64(i1 zeroext, double, double) { ; CHECK-LABEL: selectf64: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 ; CHECK-NEXT: or %s0, 0, %s2 ; CHECK-NEXT: or %s11, 0, %s9 @@ -13,8 +14,7 @@ define double @selectf64(i1 zeroext, double, double) { define float @selectf32(i1 zeroext, float, float) { ; CHECK-LABEL: selectf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 -; CHECK-NEXT: # kill: def $sf1 killed $sf1 def $sx1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 ; CHECK-NEXT: or %s0, 0, %s2 ; CHECK-NEXT: or %s11, 0, %s9 @@ -25,6 +25,7 @@ define float @selectf32(i1 zeroext, float, float) { define i64 @selecti64(i1 zeroext, i64, i64) { ; CHECK-LABEL: selecti64: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 ; CHECK-NEXT: or %s0, 0, %s2 ; CHECK-NEXT: or %s11, 0, %s9 @@ -35,10 +36,10 @@ define i64 @selecti64(i1 zeroext, i64, i64) { define i32 @selecti32(i1 zeroext, i32, i32) { ; CHECK-LABEL: selecti32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 -; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 -; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: adds.w.sx %s3, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s2, (0)1 +; CHECK-NEXT: cmov.w.ne %s0, %s1, %s3 ; CHECK-NEXT: or %s11, 0, %s9 %4 = select i1 %0, i32 
%1, i32 %2 ret i32 %4 @@ -47,10 +48,11 @@ define i32 @selecti32(i1 zeroext, i32, i32) { define zeroext i1 @selecti1(i1 zeroext, i1 zeroext, i1 zeroext) { ; CHECK-LABEL: selecti1: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: # kill: def $sw1 killed $sw1 def $sx1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 -; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %4 = select i1 %0, i1 %1, i1 %2 ret i1 %4 diff --git a/llvm/test/CodeGen/VE/selectccf32.ll b/llvm/test/CodeGen/VE/selectccf32.ll index 748a3e9275571..2832be6bc12a6 100644 --- a/llvm/test/CodeGen/VE/selectccf32.ll +++ b/llvm/test/CodeGen/VE/selectccf32.ll @@ -23,8 +23,6 @@ define float @selectccat(float, float, float, float) { define float @selectccoeq(float, float, float, float) { ; CHECK-LABEL: selectccoeq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.eq %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -37,8 +35,6 @@ define float @selectccoeq(float, float, float, float) { define float @selectccone(float, float, float, float) { ; CHECK-LABEL: selectccone: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ne %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -51,8 +47,6 @@ define float @selectccone(float, float, float, float) { define float @selectccogt(float, float, float, float) { ; CHECK-LABEL: selectccogt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -65,8 +59,6 @@ define float @selectccogt(float, float, float, float) { define float @selectccoge(float, float, float, float) { ; CHECK-LABEL: selectccoge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ge %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -79,8 +71,6 @@ define float @selectccoge(float, float, float, float) { define float @selectccolt(float, float, float, float) { ; CHECK-LABEL: selectccolt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.lt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -93,8 +83,6 @@ define float @selectccolt(float, float, float, float) { define float @selectccole(float, float, float, float) { ; CHECK-LABEL: selectccole: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.le %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -107,8 +95,6 @@ define float @selectccole(float, float, float, float) { define float @selectccord(float, float, float, float) { ; CHECK-LABEL: selectccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.num %s3, %s2, %s0 ; 
CHECK-NEXT: or %s0, 0, %s3 @@ -121,8 +107,6 @@ define float @selectccord(float, float, float, float) { define float @selectccuno(float, float, float, float) { ; CHECK-LABEL: selectccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.nan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -135,8 +119,6 @@ define float @selectccuno(float, float, float, float) { define float @selectccueq(float, float, float, float) { ; CHECK-LABEL: selectccueq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.eqnan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -149,8 +131,6 @@ define float @selectccueq(float, float, float, float) { define float @selectccune(float, float, float, float) { ; CHECK-LABEL: selectccune: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.nenan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -163,8 +143,6 @@ define float @selectccune(float, float, float, float) { define float @selectccugt(float, float, float, float) { ; CHECK-LABEL: selectccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.gtnan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -177,8 +155,6 @@ define float @selectccugt(float, float, float, float) { define float @selectccuge(float, float, float, float) { ; CHECK-LABEL: selectccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.genan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -191,8 +167,6 @@ define float @selectccuge(float, float, float, float) { define float @selectccult(float, float, float, float) { ; CHECK-LABEL: selectccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ltnan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -205,8 +179,6 @@ define float @selectccult(float, float, float, float) { define float @selectccule(float, float, float, float) { ; CHECK-LABEL: selectccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.lenan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 diff --git a/llvm/test/CodeGen/VE/selectccf32c.ll b/llvm/test/CodeGen/VE/selectccf32c.ll index 78a9aaf96729c..54a9da4c8e460 100644 --- a/llvm/test/CodeGen/VE/selectccf32c.ll +++ b/llvm/test/CodeGen/VE/selectccf32c.ll @@ -3,12 +3,10 @@ define float @selectccsgti8(i8, i8, float, float) { ; CHECK-LABEL: selectccsgti8: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 -; CHECK-NEXT: sla.w.sx %s1, %s1, 24 -; CHECK-NEXT: sra.w.sx %s1, %s1, 24 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s1, %s1, 56 +; CHECK-NEXT: sra.l %s1, %s1, 56 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: 
cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -21,12 +19,10 @@ define float @selectccsgti8(i8, i8, float, float) { define float @selectccsgti16(i16, i16, float, float) { ; CHECK-LABEL: selectccsgti16: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 -; CHECK-NEXT: sla.w.sx %s1, %s1, 16 -; CHECK-NEXT: sra.w.sx %s1, %s1, 16 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s1, %s1, 48 +; CHECK-NEXT: sra.l %s1, %s1, 48 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -39,8 +35,8 @@ define float @selectccsgti16(i16, i16, float, float) { define float @selectccsgti32(i32, i32, float, float) { ; CHECK-LABEL: selectccsgti32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -53,8 +49,6 @@ define float @selectccsgti32(i32, i32, float, float) { define float @selectccsgti64(i64, i64, float, float) { ; CHECK-LABEL: selectccsgti64: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: cmps.l %s0, %s0, %s1 ; CHECK-NEXT: cmov.l.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -67,8 +61,6 @@ define float @selectccsgti64(i64, i64, float, float) { define float @selectccsgti128(i128, i128, float, float) { ; CHECK-LABEL: selectccsgti128: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf5 killed $sf5 def $sx5 -; CHECK-NEXT: # kill: def $sf4 killed $sf4 def $sx4 ; CHECK-NEXT: or %s6, 0, (0)1 ; CHECK-NEXT: cmps.l %s1, %s1, %s3 ; CHECK-NEXT: or %s3, 0, %s6 @@ -89,8 +81,6 @@ define float @selectccsgti128(i128, i128, float, float) { define float @selectccogtf32(float, float, float, float) { ; CHECK-LABEL: selectccogtf32: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -103,8 +93,6 @@ define float @selectccogtf32(float, float, float, float) { define float @selectccogtf64(double, double, float, float) { ; CHECK-LABEL: selectccogtf64: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.d %s0, %s0, %s1 ; CHECK-NEXT: cmov.d.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 diff --git a/llvm/test/CodeGen/VE/selectccf32i.ll b/llvm/test/CodeGen/VE/selectccf32i.ll index e8285c818c54d..5e2698b4feb17 100644 --- a/llvm/test/CodeGen/VE/selectccf32i.ll +++ b/llvm/test/CodeGen/VE/selectccf32i.ll @@ -23,8 +23,6 @@ define float @selectccat(float, float, float, float) { define float @selectccoeq(float, float, float, float) { ; CHECK-LABEL: selectccoeq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.eq %s3, %s2, %s0 @@ -38,8 +36,6 @@ define float @selectccoeq(float, float, float, float) { define float @selectccone(float, 
float, float, float) { ; CHECK-LABEL: selectccone: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ne %s3, %s2, %s0 @@ -53,8 +49,6 @@ define float @selectccone(float, float, float, float) { define float @selectccogt(float, float, float, float) { ; CHECK-LABEL: selectccogt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.gt %s3, %s2, %s0 @@ -68,8 +62,6 @@ define float @selectccogt(float, float, float, float) { define float @selectccoge(float, float, float, float) { ; CHECK-LABEL: selectccoge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ge %s3, %s2, %s0 @@ -83,8 +75,6 @@ define float @selectccoge(float, float, float, float) { define float @selectccolt(float, float, float, float) { ; CHECK-LABEL: selectccolt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.lt %s3, %s2, %s0 @@ -98,8 +88,6 @@ define float @selectccolt(float, float, float, float) { define float @selectccole(float, float, float, float) { ; CHECK-LABEL: selectccole: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.le %s3, %s2, %s0 @@ -113,8 +101,6 @@ define float @selectccole(float, float, float, float) { define float @selectccord(float, float, float, float) { ; CHECK-LABEL: selectccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s0 ; CHECK-NEXT: cmov.s.num %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -127,8 +113,6 @@ define float @selectccord(float, float, float, float) { define float @selectccuno(float, float, float, float) { ; CHECK-LABEL: selectccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: fcmp.s %s0, %s0, %s0 ; CHECK-NEXT: cmov.s.nan %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -141,8 +125,6 @@ define float @selectccuno(float, float, float, float) { define float @selectccueq(float, float, float, float) { ; CHECK-LABEL: selectccueq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.eqnan %s3, %s2, %s0 @@ -156,8 +138,6 @@ define float @selectccueq(float, float, float, float) { define float @selectccune(float, float, float, float) { ; CHECK-LABEL: selectccune: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.nenan %s3, %s2, %s0 @@ -171,8 +151,6 @@ define float @selectccune(float, float, float, float) { define float @selectccugt(float, float, 
float, float) { ; CHECK-LABEL: selectccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.gtnan %s3, %s2, %s0 @@ -186,8 +164,6 @@ define float @selectccugt(float, float, float, float) { define float @selectccuge(float, float, float, float) { ; CHECK-LABEL: selectccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.genan %s3, %s2, %s0 @@ -201,8 +177,6 @@ define float @selectccuge(float, float, float, float) { define float @selectccult(float, float, float, float) { ; CHECK-LABEL: selectccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.ltnan %s3, %s2, %s0 @@ -216,8 +190,6 @@ define float @selectccult(float, float, float, float) { define float @selectccule(float, float, float, float) { ; CHECK-LABEL: selectccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sf3 killed $sf3 def $sx3 -; CHECK-NEXT: # kill: def $sf2 killed $sf2 def $sx2 ; CHECK-NEXT: lea.sl %s1, 0 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1 ; CHECK-NEXT: cmov.s.lenan %s3, %s2, %s0 diff --git a/llvm/test/CodeGen/VE/selectccf64c.ll b/llvm/test/CodeGen/VE/selectccf64c.ll index 4481d1d6197f5..24b61ece8d545 100644 --- a/llvm/test/CodeGen/VE/selectccf64c.ll +++ b/llvm/test/CodeGen/VE/selectccf64c.ll @@ -3,10 +3,10 @@ define double @selectccsgti8(i8, i8, double, double) { ; CHECK-LABEL: selectccsgti8: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s1, %s1, 24 -; CHECK-NEXT: sra.w.sx %s1, %s1, 24 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s1, %s1, 56 +; CHECK-NEXT: sra.l %s1, %s1, 56 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -19,10 +19,10 @@ define double @selectccsgti8(i8, i8, double, double) { define double @selectccsgti16(i16, i16, double, double) { ; CHECK-LABEL: selectccsgti16: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s1, %s1, 16 -; CHECK-NEXT: sra.w.sx %s1, %s1, 16 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s1, %s1, 48 +; CHECK-NEXT: sra.l %s1, %s1, 48 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -35,6 +35,8 @@ define double @selectccsgti16(i16, i16, double, double) { define double @selectccsgti32(i32, i32, double, double) { ; CHECK-LABEL: selectccsgti32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 diff --git a/llvm/test/CodeGen/VE/selectcci32.ll b/llvm/test/CodeGen/VE/selectcci32.ll index eea115c8b936b..af1861487b892 100644 --- a/llvm/test/CodeGen/VE/selectcci32.ll +++ b/llvm/test/CodeGen/VE/selectcci32.ll @@ -3,11 +3,12 @@ define i32 @selectcceq(i32, i32, i32, i32) { ; CHECK-LABEL: selectcceq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: 
# kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.eq %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.eq %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp eq i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -17,11 +18,12 @@ define i32 @selectcceq(i32, i32, i32, i32) {
 define i32 @selectccne(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccne:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.ne %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.ne %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ne i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -31,11 +33,12 @@ define i32 @selectccne(i32, i32, i32, i32) {
 define i32 @selectccsgt(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccsgt:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -45,11 +48,12 @@ define i32 @selectccsgt(i32, i32, i32, i32) {
 define i32 @selectccsge(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccsge:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.ge %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.ge %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sge i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -59,11 +63,12 @@ define i32 @selectccsge(i32, i32, i32, i32) {
 define i32 @selectccslt(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccslt:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp slt i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -73,11 +78,12 @@ define i32 @selectccslt(i32, i32, i32, i32) {
 define i32 @selectccsle(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccsle:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.le %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.le %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sle i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -87,11 +93,12 @@ define i32 @selectccsle(i32, i32, i32, i32) {
 define i32 @selectccugt(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccugt:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ugt i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -101,11 +108,12 @@ define i32 @selectccugt(i32, i32, i32, i32) {
 define i32 @selectccuge(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccuge:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.ge %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.ge %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp uge i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -115,11 +123,12 @@ define i32 @selectccuge(i32, i32, i32, i32) {
 define i32 @selectccult(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccult:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ult i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -129,11 +138,12 @@ define i32 @selectccult(i32, i32, i32, i32) {
 define i32 @selectccule(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccule:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.le %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.le %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ule i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -143,11 +153,12 @@ define i32 @selectccule(i32, i32, i32, i32) {
 define i32 @selectccugt2(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccugt2:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ugt i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -157,11 +168,12 @@ define i32 @selectccugt2(i32, i32, i32, i32) {
 define i32 @selectccuge2(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccuge2:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.ge %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.ge %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp uge i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -171,11 +183,12 @@ define i32 @selectccuge2(i32, i32, i32, i32) {
 define i32 @selectccult2(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccult2:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ult i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -185,11 +198,12 @@ define i32 @selectccult2(i32, i32, i32, i32) {
 define i32 @selectccule2(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccule2:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmpu.w %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.le %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmpu.w %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.le %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ule i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
diff --git a/llvm/test/CodeGen/VE/selectcci32c.ll b/llvm/test/CodeGen/VE/selectcci32c.ll
index 474e9e0dcd4dd..e4017c46c5790 100644
--- a/llvm/test/CodeGen/VE/selectcci32c.ll
+++ b/llvm/test/CodeGen/VE/selectcci32c.ll
@@ -3,15 +3,14 @@
 define i32 @selectccsgti8(i8, i8, i32, i32) {
 ; CHECK-LABEL: selectccsgti8:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: sla.w.sx %s1, %s1, 24
-; CHECK-NEXT: sra.w.sx %s1, %s1, 24
-; CHECK-NEXT: sla.w.sx %s0, %s0, 24
-; CHECK-NEXT: sra.w.sx %s0, %s0, 24
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: sll %s1, %s1, 56
+; CHECK-NEXT: sra.l %s1, %s1, 56
+; CHECK-NEXT: sll %s0, %s0, 56
+; CHECK-NEXT: sra.l %s4, %s0, 56
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i8 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -21,15 +20,14 @@ define i32 @selectccsgti8(i8, i8, i32, i32) {
 define i32 @selectccsgti16(i16, i16, i32, i32) {
 ; CHECK-LABEL: selectccsgti16:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: sla.w.sx %s1, %s1, 16
-; CHECK-NEXT: sra.w.sx %s1, %s1, 16
-; CHECK-NEXT: sla.w.sx %s0, %s0, 16
-; CHECK-NEXT: sra.w.sx %s0, %s0, 16
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: sll %s1, %s1, 48
+; CHECK-NEXT: sra.l %s1, %s1, 48
+; CHECK-NEXT: sll %s0, %s0, 48
+; CHECK-NEXT: sra.l %s4, %s0, 48
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i16 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -39,11 +37,12 @@ define i32 @selectccsgti16(i16, i16, i32, i32) {
 define i32 @selectccsgti32(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccsgti32:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s4, %s1
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i32 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -53,11 +52,11 @@ define i32 @selectccsgti32(i32, i32, i32, i32) {
 define i32 @selectccsgti64(i64, i64, i32, i32) {
 ; CHECK-LABEL: selectccsgti64:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
+; CHECK-NEXT: adds.w.sx %s4, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s3, (0)1
 ; CHECK-NEXT: cmps.l %s0, %s0, %s1
-; CHECK-NEXT: cmov.l.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: cmov.l.gt %s2, %s4, %s0
+; CHECK-NEXT: or %s0, 0, %s2
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i64 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -67,19 +66,19 @@ define i32 @selectccsgti64(i64, i64, i32, i32) {
 define i32 @selectccsgti128(i128, i128, i32, i32) {
 ; CHECK-LABEL: selectccsgti128:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw5 killed $sw5 def $sx5
-; CHECK-NEXT: # kill: def $sw4 killed $sw4 def $sx4
-; CHECK-NEXT: or %s6, 0, (0)1
+; CHECK-NEXT: adds.w.sx %s6, %s4, (0)1
+; CHECK-NEXT: adds.w.sx %s4, %s5, (0)1
+; CHECK-NEXT: or %s5, 0, (0)1
 ; CHECK-NEXT: cmps.l %s1, %s1, %s3
-; CHECK-NEXT: or %s3, 0, %s6
+; CHECK-NEXT: or %s3, 0, %s5
 ; CHECK-NEXT: cmov.l.gt %s3, (63)0, %s1
 ; CHECK-NEXT: cmpu.l %s0, %s0, %s2
-; CHECK-NEXT: cmov.l.gt %s6, (63)0, %s0
-; CHECK-NEXT: cmov.l.eq %s3, %s6, %s1
+; CHECK-NEXT: cmov.l.gt %s5, (63)0, %s0
+; CHECK-NEXT: cmov.l.eq %s3, %s5, %s1
 ; CHECK-NEXT: or %s0, 0, (0)1
 ; CHECK-NEXT: cmps.w.sx %s0, %s3, %s0
-; CHECK-NEXT: cmov.w.ne %s5, %s4, %s0
-; CHECK-NEXT: or %s0, 0, %s5
+; CHECK-NEXT: cmov.w.ne %s4, %s6, %s0
+; CHECK-NEXT: or %s0, 0, %s4
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i128 %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -89,11 +88,11 @@ define i32 @selectccsgti128(i128, i128, i32, i32) {
 define i32 @selectccogtf32(float, float, i32, i32) {
 ; CHECK-LABEL: selectccogtf32:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
+; CHECK-NEXT: adds.w.sx %s4, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s3, (0)1
 ; CHECK-NEXT: fcmp.s %s0, %s0, %s1
-; CHECK-NEXT: cmov.s.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: cmov.s.gt %s2, %s4, %s0
+; CHECK-NEXT: or %s0, 0, %s2
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = fcmp ogt float %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
@@ -103,11 +102,11 @@ define i32 @selectccogtf32(float, float, i32, i32) {
 define i32 @selectccogtf64(double, double, i32, i32) {
 ; CHECK-LABEL: selectccogtf64:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
+; CHECK-NEXT: adds.w.sx %s4, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s3, (0)1
 ; CHECK-NEXT: fcmp.d %s0, %s0, %s1
-; CHECK-NEXT: cmov.d.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: cmov.d.gt %s2, %s4, %s0
+; CHECK-NEXT: or %s0, 0, %s2
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = fcmp ogt double %0, %1
 %6 = select i1 %5, i32 %2, i32 %3
diff --git a/llvm/test/CodeGen/VE/selectcci32i.ll b/llvm/test/CodeGen/VE/selectcci32i.ll
index 0e17f83ad1781..a4cccd0ebf935 100644
--- a/llvm/test/CodeGen/VE/selectcci32i.ll
+++ b/llvm/test/CodeGen/VE/selectcci32i.ll
@@ -3,12 +3,12 @@
 define i32 @selectcceq(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectcceq:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: or %s1, 12, (0)1
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.eq %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: or %s3, 12, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3
+; CHECK-NEXT: cmov.w.eq %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp eq i32 %0, 12
 %6 = select i1 %5, i32 %2, i32 %3
@@ -18,12 +18,12 @@ define i32 @selectcceq(i32, i32, i32, i32) {
 define i32 @selectccne(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccne:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: or %s1, 12, (0)1
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.ne %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: or %s3, 12, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3
+; CHECK-NEXT: cmov.w.ne %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp ne i32 %0, 12
 %6 = select i1 %5, i32 %2, i32 %3
@@ -33,12 +33,12 @@ define i32 @selectccne(i32, i32, i32, i32) {
 define i32 @selectccsgt(i32, i32, i32, i32) {
 ; CHECK-LABEL: selectccsgt:
 ; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3
-; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2
-; CHECK-NEXT: or %s1, 12, (0)1
-; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1
-; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0
-; CHECK-NEXT: or %s0, 0, %s3
+; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1
+; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1
+; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1
+; CHECK-NEXT: or %s3, 12, (0)1
+; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3
+; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1
 ; CHECK-NEXT: or %s11, 0, %s9
 %5 = icmp sgt i32 %0, 12
 %6 = select i1 %5, i32 %2, i32 %3
@@ -48,12 +48,12 @@ define i32 @selectccsgt(i32, i32, i32, i32) {
 define i32
@selectccsge(i32, i32, i32, i32) { ; CHECK-LABEL: selectccsge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 11, (0)1 +; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp sge i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -63,12 +63,12 @@ define i32 @selectccsge(i32, i32, i32, i32) { define i32 @selectccslt(i32, i32, i32, i32) { ; CHECK-LABEL: selectccslt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 12, (0)1 +; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp slt i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -78,12 +78,12 @@ define i32 @selectccslt(i32, i32, i32, i32) { define i32 @selectccsle(i32, i32, i32, i32) { ; CHECK-LABEL: selectccsle: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 13, (0)1 +; CHECK-NEXT: cmps.w.sx %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp sle i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -93,12 +93,12 @@ define i32 @selectccsle(i32, i32, i32, i32) { define i32 @selectccugt(i32, i32, i32, i32) { ; CHECK-LABEL: selectccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 12, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ugt i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -108,12 +108,12 @@ define i32 @selectccugt(i32, i32, i32, i32) { define i32 @selectccuge(i32, i32, i32, i32) { ; CHECK-LABEL: selectccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 11, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp uge i32 %0, 12 %6 = select i1 
%5, i32 %2, i32 %3 @@ -123,12 +123,12 @@ define i32 @selectccuge(i32, i32, i32, i32) { define i32 @selectccult(i32, i32, i32, i32) { ; CHECK-LABEL: selectccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 12, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ult i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -138,12 +138,12 @@ define i32 @selectccult(i32, i32, i32, i32) { define i32 @selectccule(i32, i32, i32, i32) { ; CHECK-LABEL: selectccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 13, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ule i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -153,12 +153,12 @@ define i32 @selectccule(i32, i32, i32, i32) { define i32 @selectccugt2(i32, i32, i32, i32) { ; CHECK-LABEL: selectccugt2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 12, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ugt i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -168,12 +168,12 @@ define i32 @selectccugt2(i32, i32, i32, i32) { define i32 @selectccuge2(i32, i32, i32, i32) { ; CHECK-LABEL: selectccuge2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 11, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.gt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp uge i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -183,12 +183,12 @@ define i32 @selectccuge2(i32, i32, i32, i32) { define i32 @selectccult2(i32, i32, i32, i32) { ; CHECK-LABEL: selectccult2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 12, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: 
cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ult i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 @@ -198,12 +198,12 @@ define i32 @selectccult2(i32, i32, i32, i32) { define i32 @selectccule2(i32, i32, i32, i32) { ; CHECK-LABEL: selectccule2: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: # kill: def $sw3 killed $sw3 def $sx3 -; CHECK-NEXT: # kill: def $sw2 killed $sw2 def $sx2 -; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmpu.w %s0, %s0, %s1 -; CHECK-NEXT: cmov.w.lt %s3, %s2, %s0 -; CHECK-NEXT: or %s0, 0, %s3 +; CHECK-NEXT: adds.w.sx %s1, %s0, (0)1 +; CHECK-NEXT: adds.w.sx %s2, %s2, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s3, (0)1 +; CHECK-NEXT: or %s3, 13, (0)1 +; CHECK-NEXT: cmpu.w %s1, %s1, %s3 +; CHECK-NEXT: cmov.w.lt %s0, %s2, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %5 = icmp ule i32 %0, 12 %6 = select i1 %5, i32 %2, i32 %3 diff --git a/llvm/test/CodeGen/VE/selectcci64c.ll b/llvm/test/CodeGen/VE/selectcci64c.ll index 7bb7d7fee1c31..276f23d9a5ffd 100644 --- a/llvm/test/CodeGen/VE/selectcci64c.ll +++ b/llvm/test/CodeGen/VE/selectcci64c.ll @@ -3,10 +3,10 @@ define i64 @selectccsgti8(i8, i8, i64, i64) { ; CHECK-LABEL: selectccsgti8: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s1, %s1, 24 -; CHECK-NEXT: sra.w.sx %s1, %s1, 24 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s1, %s1, 56 +; CHECK-NEXT: sra.l %s1, %s1, 56 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -19,10 +19,10 @@ define i64 @selectccsgti8(i8, i8, i64, i64) { define i64 @selectccsgti16(i16, i16, i64, i64) { ; CHECK-LABEL: selectccsgti16: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: sla.w.sx %s1, %s1, 16 -; CHECK-NEXT: sra.w.sx %s1, %s1, 16 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s1, %s1, 48 +; CHECK-NEXT: sra.l %s1, %s1, 48 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 @@ -35,6 +35,8 @@ define i64 @selectccsgti16(i16, i16, i64, i64) { define i64 @selectccsgti32(i32, i32, i64, i64) { ; CHECK-LABEL: selectccsgti32: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 ; CHECK-NEXT: cmov.w.gt %s3, %s2, %s0 ; CHECK-NEXT: or %s0, 0, %s3 diff --git a/llvm/test/CodeGen/VE/setccf32.ll b/llvm/test/CodeGen/VE/setccf32.ll index 6ced8ce53b9c4..f2e9062fcf624 100644 --- a/llvm/test/CodeGen/VE/setccf32.ll +++ b/llvm/test/CodeGen/VE/setccf32.ll @@ -21,10 +21,10 @@ define zeroext i1 @setccat(float, float) { define zeroext i1 @setccoeq(float, float) { ; CHECK-LABEL: setccoeq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oeq float %0, %1 ret i1 %3 @@ -33,10 +33,10 @@ define zeroext i1 @setccoeq(float, float) { define zeroext i1 @setccone(float, float) { ; CHECK-LABEL: setccone: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; 
CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp one float %0, %1 ret i1 %3 @@ -45,10 +45,10 @@ define zeroext i1 @setccone(float, float) { define zeroext i1 @setccogt(float, float) { ; CHECK-LABEL: setccogt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ogt float %0, %1 ret i1 %3 @@ -57,10 +57,10 @@ define zeroext i1 @setccogt(float, float) { define zeroext i1 @setccoge(float, float) { ; CHECK-LABEL: setccoge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oge float %0, %1 ret i1 %3 @@ -69,10 +69,10 @@ define zeroext i1 @setccoge(float, float) { define zeroext i1 @setccolt(float, float) { ; CHECK-LABEL: setccolt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp olt float %0, %1 ret i1 %3 @@ -81,10 +81,10 @@ define zeroext i1 @setccolt(float, float) { define zeroext i1 @setccole(float, float) { ; CHECK-LABEL: setccole: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ole float %0, %1 ret i1 %3 @@ -93,10 +93,10 @@ define zeroext i1 @setccole(float, float) { define zeroext i1 @setccord(float, float) { ; CHECK-LABEL: setccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.num %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.num %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ord float %0, %1 ret i1 %3 @@ -105,10 +105,10 @@ define zeroext i1 @setccord(float, float) { define zeroext i1 @setccuno(float, float) { ; CHECK-LABEL: setccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.nan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.nan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uno float %0, %1 ret i1 %3 @@ -117,10 +117,10 @@ define zeroext i1 @setccuno(float, float) { define zeroext i1 @setccueq(float, 
float) { ; CHECK-LABEL: setccueq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.eqnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.eqnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ueq float %0, %1 ret i1 %3 @@ -129,10 +129,10 @@ define zeroext i1 @setccueq(float, float) { define zeroext i1 @setccune(float, float) { ; CHECK-LABEL: setccune: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.nenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.nenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp une float %0, %1 ret i1 %3 @@ -141,10 +141,10 @@ define zeroext i1 @setccune(float, float) { define zeroext i1 @setccugt(float, float) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.gtnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.gtnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ugt float %0, %1 ret i1 %3 @@ -153,10 +153,10 @@ define zeroext i1 @setccugt(float, float) { define zeroext i1 @setccuge(float, float) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.genan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.genan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uge float %0, %1 ret i1 %3 @@ -165,10 +165,10 @@ define zeroext i1 @setccuge(float, float) { define zeroext i1 @setccult(float, float) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ltnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ltnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ult float %0, %1 ret i1 %3 @@ -177,10 +177,10 @@ define zeroext i1 @setccult(float, float) { define zeroext i1 @setccule(float, float) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.lenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.lenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ule float %0, %1 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setccf32i.ll b/llvm/test/CodeGen/VE/setccf32i.ll index 8f79219359f3c..3f90a103fec53 100644 --- a/llvm/test/CodeGen/VE/setccf32i.ll +++ b/llvm/test/CodeGen/VE/setccf32i.ll @@ -22,10 +22,10 @@ define zeroext i1 @setccoeq(float, float) { ; CHECK-LABEL: setccoeq: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s 
%s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oeq float %0, 0.0 ret i1 %3 @@ -35,10 +35,10 @@ define zeroext i1 @setccone(float, float) { ; CHECK-LABEL: setccone: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp one float %0, 0.0 ret i1 %3 @@ -48,10 +48,10 @@ define zeroext i1 @setccogt(float, float) { ; CHECK-LABEL: setccogt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ogt float %0, 0.0 ret i1 %3 @@ -61,10 +61,10 @@ define zeroext i1 @setccoge(float, float) { ; CHECK-LABEL: setccoge: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oge float %0, 0.0 ret i1 %3 @@ -74,10 +74,10 @@ define zeroext i1 @setccolt(float, float) { ; CHECK-LABEL: setccolt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp olt float %0, 0.0 ret i1 %3 @@ -87,10 +87,10 @@ define zeroext i1 @setccole(float, float) { ; CHECK-LABEL: setccole: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ole float %0, 0.0 ret i1 %3 @@ -99,10 +99,10 @@ define zeroext i1 @setccole(float, float) { define zeroext i1 @setccord(float, float) { ; CHECK-LABEL: setccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s0 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.num %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.num %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ord float %0, 0.0 ret i1 %3 @@ -111,10 +111,10 @@ define 
zeroext i1 @setccord(float, float) { define zeroext i1 @setccuno(float, float) { ; CHECK-LABEL: setccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.s %s1, %s0, %s0 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.nan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.nan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uno float %0, 0.0 ret i1 %3 @@ -124,10 +124,10 @@ define zeroext i1 @setccueq(float, float) { ; CHECK-LABEL: setccueq: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.eqnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.eqnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ueq float %0, 0.0 ret i1 %3 @@ -137,10 +137,10 @@ define zeroext i1 @setccune(float, float) { ; CHECK-LABEL: setccune: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.nenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.nenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp une float %0, 0.0 ret i1 %3 @@ -150,10 +150,10 @@ define zeroext i1 @setccugt(float, float) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.gtnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.gtnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ugt float %0, 0.0 ret i1 %3 @@ -163,10 +163,10 @@ define zeroext i1 @setccuge(float, float) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.genan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.genan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uge float %0, 0.0 ret i1 %3 @@ -176,10 +176,10 @@ define zeroext i1 @setccult(float, float) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.ltnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.s.ltnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ult float %0, 0.0 ret i1 %3 @@ -189,10 +189,10 @@ define zeroext i1 @setccule(float, float) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.s %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.s.lenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.s %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 
+; CHECK-NEXT: cmov.s.lenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ule float %0, 0.0 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setccf64.ll b/llvm/test/CodeGen/VE/setccf64.ll index dca40e8231fa9..98c0e6c56bf41 100644 --- a/llvm/test/CodeGen/VE/setccf64.ll +++ b/llvm/test/CodeGen/VE/setccf64.ll @@ -21,10 +21,10 @@ define zeroext i1 @setccat(double, double) { define zeroext i1 @setccoeq(double, double) { ; CHECK-LABEL: setccoeq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oeq double %0, %1 ret i1 %3 @@ -33,10 +33,10 @@ define zeroext i1 @setccoeq(double, double) { define zeroext i1 @setccone(double, double) { ; CHECK-LABEL: setccone: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp one double %0, %1 ret i1 %3 @@ -45,10 +45,10 @@ define zeroext i1 @setccone(double, double) { define zeroext i1 @setccogt(double, double) { ; CHECK-LABEL: setccogt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ogt double %0, %1 ret i1 %3 @@ -57,10 +57,10 @@ define zeroext i1 @setccogt(double, double) { define zeroext i1 @setccoge(double, double) { ; CHECK-LABEL: setccoge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oge double %0, %1 ret i1 %3 @@ -69,10 +69,10 @@ define zeroext i1 @setccoge(double, double) { define zeroext i1 @setccolt(double, double) { ; CHECK-LABEL: setccolt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp olt double %0, %1 ret i1 %3 @@ -81,10 +81,10 @@ define zeroext i1 @setccolt(double, double) { define zeroext i1 @setccole(double, double) { ; CHECK-LABEL: setccole: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; 
CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ole double %0, %1 ret i1 %3 @@ -93,10 +93,10 @@ define zeroext i1 @setccole(double, double) { define zeroext i1 @setccord(double, double) { ; CHECK-LABEL: setccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.num %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.num %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ord double %0, %1 ret i1 %3 @@ -105,10 +105,10 @@ define zeroext i1 @setccord(double, double) { define zeroext i1 @setccuno(double, double) { ; CHECK-LABEL: setccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.nan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.nan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uno double %0, %1 ret i1 %3 @@ -117,10 +117,10 @@ define zeroext i1 @setccuno(double, double) { define zeroext i1 @setccueq(double, double) { ; CHECK-LABEL: setccueq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.eqnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.eqnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ueq double %0, %1 ret i1 %3 @@ -129,10 +129,10 @@ define zeroext i1 @setccueq(double, double) { define zeroext i1 @setccune(double, double) { ; CHECK-LABEL: setccune: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.nenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.nenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp une double %0, %1 ret i1 %3 @@ -141,10 +141,10 @@ define zeroext i1 @setccune(double, double) { define zeroext i1 @setccugt(double, double) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.gtnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.gtnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ugt double %0, %1 ret i1 %3 @@ -153,10 +153,10 @@ define zeroext i1 @setccugt(double, double) { define zeroext i1 @setccuge(double, double) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.genan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.genan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uge double %0, %1 ret i1 %3 @@ -165,10 +165,10 @@ define zeroext i1 @setccuge(double, double) { define zeroext i1 @setccult(double, double) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: 
fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ltnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ltnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ult double %0, %1 ret i1 %3 @@ -177,10 +177,10 @@ define zeroext i1 @setccult(double, double) { define zeroext i1 @setccule(double, double) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.lenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.lenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ule double %0, %1 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setccf64i.ll b/llvm/test/CodeGen/VE/setccf64i.ll index 59af1b4103f6b..f3a8d2f35f6f7 100644 --- a/llvm/test/CodeGen/VE/setccf64i.ll +++ b/llvm/test/CodeGen/VE/setccf64i.ll @@ -22,10 +22,10 @@ define zeroext i1 @setccoeq(double, double) { ; CHECK-LABEL: setccoeq: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oeq double %0, 0.0 ret i1 %3 @@ -35,10 +35,10 @@ define zeroext i1 @setccone(double, double) { ; CHECK-LABEL: setccone: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp one double %0, 0.0 ret i1 %3 @@ -48,10 +48,10 @@ define zeroext i1 @setccogt(double, double) { ; CHECK-LABEL: setccogt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ogt double %0, 0.0 ret i1 %3 @@ -61,10 +61,10 @@ define zeroext i1 @setccoge(double, double) { ; CHECK-LABEL: setccoge: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp oge double %0, 0.0 ret i1 %3 @@ -74,10 +74,10 @@ define zeroext i1 @setccolt(double, double) { ; CHECK-LABEL: setccolt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; 
CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp olt double %0, 0.0 ret i1 %3 @@ -87,10 +87,10 @@ define zeroext i1 @setccole(double, double) { ; CHECK-LABEL: setccole: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ole double %0, 0.0 ret i1 %3 @@ -99,10 +99,10 @@ define zeroext i1 @setccole(double, double) { define zeroext i1 @setccord(double, double) { ; CHECK-LABEL: setccord: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s0 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.num %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.num %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ord double %0, 0.0 ret i1 %3 @@ -111,10 +111,10 @@ define zeroext i1 @setccord(double, double) { define zeroext i1 @setccuno(double, double) { ; CHECK-LABEL: setccuno: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: fcmp.d %s1, %s0, %s0 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.nan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.nan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uno double %0, 0.0 ret i1 %3 @@ -124,10 +124,10 @@ define zeroext i1 @setccueq(double, double) { ; CHECK-LABEL: setccueq: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.eqnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.eqnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ueq double %0, 0.0 ret i1 %3 @@ -137,10 +137,10 @@ define zeroext i1 @setccune(double, double) { ; CHECK-LABEL: setccune: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.nenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.nenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp une double %0, 0.0 ret i1 %3 @@ -150,10 +150,10 @@ define zeroext i1 @setccugt(double, double) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.gtnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.gtnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ugt double %0, 0.0 ret i1 %3 @@ -163,10 +163,10 @@ define zeroext i1 @setccuge(double, double) { ; CHECK-LABEL: setccuge: ; CHECK: 
.LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.genan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.genan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp uge double %0, 0.0 ret i1 %3 @@ -176,10 +176,10 @@ define zeroext i1 @setccult(double, double) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.ltnan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.ltnan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ult double %0, 0.0 ret i1 %3 @@ -189,10 +189,10 @@ define zeroext i1 @setccule(double, double) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: lea.sl %s1, 0 -; CHECK-NEXT: fcmp.d %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.d.lenan %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: fcmp.d %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.d.lenan %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = fcmp ule double %0, 0.0 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setcci32.ll b/llvm/test/CodeGen/VE/setcci32.ll index 7e92a2c7f5ede..15bf130fec8b4 100644 --- a/llvm/test/CodeGen/VE/setcci32.ll +++ b/llvm/test/CodeGen/VE/setcci32.ll @@ -3,10 +3,12 @@ define zeroext i1 @setcceq(i32, i32) { ; CHECK-LABEL: setcceq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp eq i32 %0, %1 ret i1 %3 @@ -15,10 +17,12 @@ define zeroext i1 @setcceq(i32, i32) { define zeroext i1 @setccne(i32, i32) { ; CHECK-LABEL: setccne: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ne i32 %0, %1 ret i1 %3 @@ -27,10 +31,12 @@ define zeroext i1 @setccne(i32, i32) { define zeroext i1 @setccugt(i32, i32) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ugt i32 %0, %1 ret i1 %3 @@ -39,10 +45,12 @@ define zeroext i1 @setccugt(i32, i32) { 
define zeroext i1 @setccuge(i32, i32) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp uge i32 %0, %1 ret i1 %3 @@ -51,10 +59,12 @@ define zeroext i1 @setccuge(i32, i32) { define zeroext i1 @setccult(i32, i32) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ult i32 %0, %1 ret i1 %3 @@ -63,10 +73,12 @@ define zeroext i1 @setccult(i32, i32) { define zeroext i1 @setccule(i32, i32) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ule i32 %0, %1 ret i1 %3 @@ -75,10 +87,12 @@ define zeroext i1 @setccule(i32, i32) { define zeroext i1 @setccsgt(i32, i32) { ; CHECK-LABEL: setccsgt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sgt i32 %0, %1 ret i1 %3 @@ -87,10 +101,12 @@ define zeroext i1 @setccsgt(i32, i32) { define zeroext i1 @setccsge(i32, i32) { ; CHECK-LABEL: setccsge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sge i32 %0, %1 ret i1 %3 @@ -99,10 +115,12 @@ define zeroext i1 @setccsge(i32, i32) { define zeroext i1 @setccslt(i32, i32) { ; CHECK-LABEL: setccslt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; 
CHECK-NEXT: or %s11, 0, %s9 %3 = icmp slt i32 %0, %1 ret i1 %3 @@ -111,10 +129,12 @@ define zeroext i1 @setccslt(i32, i32) { define zeroext i1 @setccsle(i32, i32) { ; CHECK-LABEL: setccsle: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sle i32 %0, %1 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setcci32i.ll b/llvm/test/CodeGen/VE/setcci32i.ll index c0d1fb0a67e4a..ac226190ae44f 100644 --- a/llvm/test/CodeGen/VE/setcci32i.ll +++ b/llvm/test/CodeGen/VE/setcci32i.ll @@ -3,11 +3,12 @@ define zeroext i1 @setcceq(i32, i32) { ; CHECK-LABEL: setcceq: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp eq i32 %0, 12 ret i1 %3 @@ -16,11 +17,12 @@ define zeroext i1 @setcceq(i32, i32) { define zeroext i1 @setccne(i32, i32) { ; CHECK-LABEL: setccne: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ne i32 %0, 12 ret i1 %3 @@ -29,11 +31,12 @@ define zeroext i1 @setccne(i32, i32) { define zeroext i1 @setccugt(i32, i32) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ugt i32 %0, 12 ret i1 %3 @@ -42,11 +45,12 @@ define zeroext i1 @setccugt(i32, i32) { define zeroext i1 @setccuge(i32, i32) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp uge i32 %0, 12 ret i1 %3 @@ -55,11 +59,12 @@ define zeroext i1 @setccuge(i32, i32) { define zeroext i1 @setccult(i32, i32) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: 
cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ult i32 %0, 12 ret i1 %3 @@ -68,11 +73,12 @@ define zeroext i1 @setccult(i32, i32) { define zeroext i1 @setccule(i32, i32) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmpu.w %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.w %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ule i32 %0, 12 ret i1 %3 @@ -81,11 +87,12 @@ define zeroext i1 @setccule(i32, i32) { define zeroext i1 @setccsgt(i32, i32) { ; CHECK-LABEL: setccsgt: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sgt i32 %0, 12 ret i1 %3 @@ -94,11 +101,12 @@ define zeroext i1 @setccsgt(i32, i32) { define zeroext i1 @setccsge(i32, i32) { ; CHECK-LABEL: setccsge: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sge i32 %0, 12 ret i1 %3 @@ -107,11 +115,12 @@ define zeroext i1 @setccsge(i32, i32) { define zeroext i1 @setccslt(i32, i32) { ; CHECK-LABEL: setccslt: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp slt i32 %0, 12 ret i1 %3 @@ -120,11 +129,12 @@ define zeroext i1 @setccslt(i32, i32) { define zeroext i1 @setccsle(i32, i32) { ; CHECK-LABEL: setccsle: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmps.w.sx %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.w.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.w.sx %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sle i32 %0, 12 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setcci64.ll b/llvm/test/CodeGen/VE/setcci64.ll index 8b86601594da9..5cae80a60f06d 100644 --- a/llvm/test/CodeGen/VE/setcci64.ll +++ b/llvm/test/CodeGen/VE/setcci64.ll @@ -3,10 +3,10 @@ define 
zeroext i1 @setcceq(i64, i64) { ; CHECK-LABEL: setcceq: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp eq i64 %0, %1 ret i1 %3 @@ -15,10 +15,10 @@ define zeroext i1 @setcceq(i64, i64) { define zeroext i1 @setccne(i64, i64) { ; CHECK-LABEL: setccne: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ne i64 %0, %1 ret i1 %3 @@ -27,10 +27,10 @@ define zeroext i1 @setccne(i64, i64) { define zeroext i1 @setccugt(i64, i64) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ugt i64 %0, %1 ret i1 %3 @@ -39,10 +39,10 @@ define zeroext i1 @setccugt(i64, i64) { define zeroext i1 @setccuge(i64, i64) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp uge i64 %0, %1 ret i1 %3 @@ -51,10 +51,10 @@ define zeroext i1 @setccuge(i64, i64) { define zeroext i1 @setccult(i64, i64) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ult i64 %0, %1 ret i1 %3 @@ -63,10 +63,10 @@ define zeroext i1 @setccult(i64, i64) { define zeroext i1 @setccule(i64, i64) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ule i64 %0, %1 ret i1 %3 @@ -75,10 +75,10 @@ define zeroext i1 @setccule(i64, i64) { define zeroext i1 @setccsgt(i64, i64) { ; CHECK-LABEL: setccsgt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, 
%s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sgt i64 %0, %1 ret i1 %3 @@ -87,10 +87,10 @@ define zeroext i1 @setccsgt(i64, i64) { define zeroext i1 @setccsge(i64, i64) { ; CHECK-LABEL: setccsge: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.ge %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.ge %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sge i64 %0, %1 ret i1 %3 @@ -99,10 +99,10 @@ define zeroext i1 @setccsge(i64, i64) { define zeroext i1 @setccslt(i64, i64) { ; CHECK-LABEL: setccslt: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp slt i64 %0, %1 ret i1 %3 @@ -111,10 +111,10 @@ define zeroext i1 @setccslt(i64, i64) { define zeroext i1 @setccsle(i64, i64) { ; CHECK-LABEL: setccsle: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.le %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.le %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sle i64 %0, %1 ret i1 %3 diff --git a/llvm/test/CodeGen/VE/setcci64i.ll b/llvm/test/CodeGen/VE/setcci64i.ll index aecbe40b0a34d..c73db4c72276d 100644 --- a/llvm/test/CodeGen/VE/setcci64i.ll +++ b/llvm/test/CodeGen/VE/setcci64i.ll @@ -4,10 +4,10 @@ define zeroext i1 @setcceq(i64, i64) { ; CHECK-LABEL: setcceq: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.eq %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.eq %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp eq i64 %0, 12 ret i1 %3 @@ -17,10 +17,10 @@ define zeroext i1 @setccne(i64, i64) { ; CHECK-LABEL: setccne: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.ne %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.ne %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ne i64 %0, 12 ret i1 %3 @@ -30,10 +30,10 @@ define zeroext i1 @setccugt(i64, i64) { ; CHECK-LABEL: setccugt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ugt i64 %0, 12 ret i1 %3 @@ -43,10 +43,10 @@ define zeroext i1 @setccuge(i64, i64) { ; CHECK-LABEL: setccuge: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 
11, (0)1 -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp uge i64 %0, 12 ret i1 %3 @@ -56,10 +56,10 @@ define zeroext i1 @setccult(i64, i64) { ; CHECK-LABEL: setccult: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ult i64 %0, 12 ret i1 %3 @@ -69,10 +69,10 @@ define zeroext i1 @setccule(i64, i64) { ; CHECK-LABEL: setccule: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmpu.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmpu.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp ule i64 %0, 12 ret i1 %3 @@ -82,10 +82,10 @@ define zeroext i1 @setccsgt(i64, i64) { ; CHECK-LABEL: setccsgt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sgt i64 %0, 12 ret i1 %3 @@ -95,10 +95,10 @@ define zeroext i1 @setccsge(i64, i64) { ; CHECK-LABEL: setccsge: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 11, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.gt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.gt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sge i64 %0, 12 ret i1 %3 @@ -108,10 +108,10 @@ define zeroext i1 @setccslt(i64, i64) { ; CHECK-LABEL: setccslt: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 12, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp slt i64 %0, 12 ret i1 %3 @@ -121,10 +121,10 @@ define zeroext i1 @setccsle(i64, i64) { ; CHECK-LABEL: setccsle: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: or %s1, 13, (0)1 -; CHECK-NEXT: cmps.l %s1, %s0, %s1 -; CHECK-NEXT: or %s0, 0, (0)1 -; CHECK-NEXT: cmov.l.lt %s0, (63)0, %s1 -; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0 +; CHECK-NEXT: cmps.l %s0, %s0, %s1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.l.lt %s1, (63)0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = icmp sle i64 %0, 12 ret i1 %3 diff --git 
a/llvm/test/CodeGen/VE/sext_zext_load.ll b/llvm/test/CodeGen/VE/sext_zext_load.ll index b9fc6bc4daf74..600a02a5b1303 100644 --- a/llvm/test/CodeGen/VE/sext_zext_load.ll +++ b/llvm/test/CodeGen/VE/sext_zext_load.ll @@ -267,8 +267,8 @@ define signext i8 @func37() { ; CHECK-LABEL: func37: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: ld1b.zx %s0, 191(, %s11) -; CHECK-NEXT: sla.w.sx %s0, %s0, 31 -; CHECK-NEXT: sra.w.sx %s0, %s0, 31 +; CHECK-NEXT: sll %s0, %s0, 63 +; CHECK-NEXT: sra.l %s0, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %a = alloca i1, align 1 %a.val = load i1, i1* %a, align 1 @@ -280,8 +280,8 @@ define signext i16 @func38() { ; CHECK-LABEL: func38: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: ld1b.zx %s0, 191(, %s11) -; CHECK-NEXT: sla.w.sx %s0, %s0, 31 -; CHECK-NEXT: sra.w.sx %s0, %s0, 31 +; CHECK-NEXT: sll %s0, %s0, 63 +; CHECK-NEXT: sra.l %s0, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %a = alloca i1, align 1 %a.val = load i1, i1* %a, align 1 @@ -293,8 +293,8 @@ define signext i32 @func39() { ; CHECK-LABEL: func39: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: ld1b.zx %s0, 191(, %s11) -; CHECK-NEXT: sla.w.sx %s0, %s0, 31 -; CHECK-NEXT: sra.w.sx %s0, %s0, 31 +; CHECK-NEXT: sll %s0, %s0, 63 +; CHECK-NEXT: sra.l %s0, %s0, 63 ; CHECK-NEXT: or %s11, 0, %s9 %a = alloca i1, align 1 %a.val = load i1, i1* %a, align 1 diff --git a/llvm/test/CodeGen/VE/subtraction.ll b/llvm/test/CodeGen/VE/subtraction.ll index 1bd85d4290929..43a30bfe1e1b7 100644 --- a/llvm/test/CodeGen/VE/subtraction.ll +++ b/llvm/test/CodeGen/VE/subtraction.ll @@ -3,9 +3,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { ; CHECK-LABEL: func1: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sub i8 %0, %1 ret i8 %3 @@ -14,9 +16,11 @@ define signext i8 @func1(i8 signext %0, i8 signext %1) { define signext i16 @func2(i16 signext %0, i16 signext %1) { ; CHECK-LABEL: func2: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sub i16 %0, %1 ret i16 %3 @@ -25,6 +29,8 @@ define signext i16 @func2(i16 signext %0, i16 signext %1) { define i32 @func3(i32 %0, i32 %1) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sub nsw i32 %0, %1 @@ -43,6 +49,8 @@ define i64 @func4(i64 %0, i64 %1) { define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -53,6 +61,8 @@ define zeroext i8 @func6(i8 zeroext %0, i8 zeroext %1) { define zeroext i16 @func7(i16 zeroext %0, i16 zeroext %1) { ; CHECK-LABEL: func7: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -63,6 +73,8 @@ define zeroext i16 
@func7(i16 zeroext %0, i16 zeroext %1) { define i32 @func8(i32 %0, i32 %1) { ; CHECK-LABEL: func8: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s1, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: subs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: or %s11, 0, %s9 %3 = sub i32 %0, %1 @@ -81,9 +93,10 @@ define i64 @func9(i64 %0, i64 %1) { define signext i8 @func13(i8 signext %0, i8 signext %1) { ; CHECK-LABEL: func13: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 24 -; CHECK-NEXT: sra.w.sx %s0, %s0, 24 +; CHECK-NEXT: sll %s0, %s0, 56 +; CHECK-NEXT: sra.l %s0, %s0, 56 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add i8 %0, -5 ret i8 %3 @@ -92,9 +105,10 @@ define signext i8 @func13(i8 signext %0, i8 signext %1) { define signext i16 @func14(i16 signext %0, i16 signext %1) { ; CHECK-LABEL: func14: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 -; CHECK-NEXT: sla.w.sx %s0, %s0, 16 -; CHECK-NEXT: sra.w.sx %s0, %s0, 16 +; CHECK-NEXT: sll %s0, %s0, 48 +; CHECK-NEXT: sra.l %s0, %s0, 48 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add i16 %0, -5 ret i16 %3 @@ -103,6 +117,7 @@ define signext i16 @func14(i16 signext %0, i16 signext %1) { define i32 @func15(i32 %0, i32 %1) { ; CHECK-LABEL: func15: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add nsw i32 %0, -5 @@ -121,6 +136,7 @@ define i64 @func16(i64 %0, i64 %1) { define zeroext i8 @func18(i8 zeroext %0, i8 zeroext %1) { ; CHECK-LABEL: func18: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 ; CHECK-NEXT: and %s0, %s0, (56)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -131,6 +147,7 @@ define zeroext i8 @func18(i8 zeroext %0, i8 zeroext %1) { define zeroext i16 @func19(i16 zeroext %0, i16 zeroext %1) { ; CHECK-LABEL: func19: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 ; CHECK-NEXT: and %s0, %s0, (48)0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -141,6 +158,7 @@ define zeroext i16 @func19(i16 zeroext %0, i16 zeroext %1) { define i32 @func20(i32 %0, i32 %1) { ; CHECK-LABEL: func20: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: adds.w.sx %s0, -5, %s0 ; CHECK-NEXT: or %s11, 0, %s9 %3 = add i32 %0, -5 diff --git a/llvm/test/CodeGen/VE/truncstore.ll b/llvm/test/CodeGen/VE/truncstore.ll index 357cc6b117914..97a4da4cd93be 100644 --- a/llvm/test/CodeGen/VE/truncstore.ll +++ b/llvm/test/CodeGen/VE/truncstore.ll @@ -33,7 +33,6 @@ define void @func2(i8 signext %p, i32* %a) { define void @func3(i8 signext %p, i64* %a) { ; CHECK-LABEL: func3: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: st %s0, (, %s1) ; CHECK-NEXT: or %s11, 0, %s9 %p.conv = sext i8 %p to i64 @@ -54,7 +53,6 @@ define void @func5(i16 signext %p, i32* %a) { define void @func6(i16 signext %p, i64* %a) { ; CHECK-LABEL: func6: ; CHECK: .LBB{{[0-9]+}}_2: -; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: st %s0, (, %s1) ; CHECK-NEXT: or %s11, 0, %s9 %p.conv = sext i16 %p to i64 diff --git a/llvm/test/CodeGen/VE/va_caller.ll b/llvm/test/CodeGen/VE/va_caller.ll index 345ab80867f1f..b43ce999c589d 100644 --- a/llvm/test/CodeGen/VE/va_caller.ll +++ b/llvm/test/CodeGen/VE/va_caller.ll @@ -6,38 +6,36 @@ define i32 @caller() { ; CHECK-LABEL: caller: ; CHECK: .LBB{{[0-9]+}}_2: ; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte 
Folded Spill -; CHECK-NEXT: or %s7, 0, (0)1 -; CHECK-NEXT: st %s7, 280(, %s11) +; CHECK-NEXT: or %s18, 0, (0)1 +; CHECK-NEXT: st %s18, 280(, %s11) ; CHECK-NEXT: or %s0, 11, (0)1 ; CHECK-NEXT: st %s0, 272(, %s11) -; CHECK-NEXT: st %s7, 264(, %s11) +; CHECK-NEXT: st %s18, 264(, %s11) ; CHECK-NEXT: or %s0, 10, (0)1 ; CHECK-NEXT: st %s0, 256(, %s11) ; CHECK-NEXT: lea.sl %s0, 1075970048 ; CHECK-NEXT: st %s0, 248(, %s11) ; CHECK-NEXT: or %s0, 8, (0)1 ; CHECK-NEXT: st %s0, 240(, %s11) -; CHECK-NEXT: st %s7, 232(, %s11) -; CHECK-NEXT: lea %s0, 1086324736 -; CHECK-NEXT: stl %s0, 228(, %s11) +; CHECK-NEXT: st %s18, 232(, %s11) ; CHECK-NEXT: or %s5, 5, (0)1 -; CHECK-NEXT: stl %s5, 216(, %s11) +; CHECK-NEXT: st %s5, 216(, %s11) ; CHECK-NEXT: or %s4, 4, (0)1 -; CHECK-NEXT: stl %s4, 208(, %s11) +; CHECK-NEXT: st %s4, 208(, %s11) ; CHECK-NEXT: or %s3, 3, (0)1 -; CHECK-NEXT: stl %s3, 200(, %s11) +; CHECK-NEXT: st %s3, 200(, %s11) ; CHECK-NEXT: or %s2, 2, (0)1 -; CHECK-NEXT: stl %s2, 192(, %s11) +; CHECK-NEXT: st %s2, 192(, %s11) ; CHECK-NEXT: or %s1, 1, (0)1 -; CHECK-NEXT: stl %s1, 184(, %s11) -; CHECK-NEXT: or %s18, 0, (0)1 +; CHECK-NEXT: st %s1, 184(, %s11) +; CHECK-NEXT: st %s18, 176(, %s11) +; CHECK-NEXT: lea.sl %s6, 1086324736 ; CHECK-NEXT: lea %s0, func@lo ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s12, func@hi(, %s0) -; CHECK-NEXT: lea.sl %s6, 1086324736 -; CHECK-NEXT: stl %s18, 176(, %s11) +; CHECK-NEXT: st %s6, 224(, %s11) ; CHECK-NEXT: or %s0, 0, %s18 -; CHECK-NEXT: # kill: def $sf6 killed $sf6 killed $sx6 +; CHECK-NEXT: or %s7, 0, %s18 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s0, 0, %s18 ; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/VE/xor.ll b/llvm/test/CodeGen/VE/xor.ll index d1701f6ef5017..b3336bb72ff5a 100644 --- a/llvm/test/CodeGen/VE/xor.ll +++ b/llvm/test/CodeGen/VE/xor.ll @@ -21,7 +21,9 @@ define zeroext i8 @func8z(i8 zeroext %a, i8 zeroext %b) { define signext i8 @funci8s(i8 signext %a) { ; CHECK-LABEL: funci8s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, 5, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i8 %a, 5 ret i8 %res @@ -30,8 +32,10 @@ define signext i8 @funci8s(i8 signext %a) { define zeroext i8 @funci8z(i8 zeroext %a) { ; CHECK-LABEL: funci8z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: lea %s1, 251 ; CHECK-NEXT: xor %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i8 -5, %a ret i8 %res @@ -58,7 +62,9 @@ define zeroext i16 @func16z(i16 zeroext %a, i16 zeroext %b) { define signext i16 @funci16s(i16 signext %a) { ; CHECK-LABEL: funci16s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i16 %a, 65535 ret i16 %res @@ -67,7 +73,9 @@ define signext i16 @funci16s(i16 signext %a) { define zeroext i16 @funci16z(i16 zeroext %a) { ; CHECK-LABEL: funci16z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, %s0, (52)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i16 4095, %a ret i16 %res @@ -94,7 +102,9 @@ define zeroext i32 @func32z(i32 zeroext %a, i32 zeroext %b) { define signext i32 @funci32s(i32 signext %a) { ; CHECK-LABEL: funci32s: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, %s0, (36)0 +; CHECK-NEXT: 
adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i32 %a, 268435455 ret i32 %res @@ -103,7 +113,9 @@ define signext i32 @funci32s(i32 signext %a) { define zeroext i32 @funci32z(i32 zeroext %a) { ; CHECK-LABEL: funci32z: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, %s0, (36)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: or %s11, 0, %s9 %res = xor i32 %a, 268435455 ret i32 %res @@ -112,6 +124,7 @@ define zeroext i32 @funci32z(i32 zeroext %a) { define i32 @funci32_another(i32 %0) { ; CHECK-LABEL: funci32_another: ; CHECK: .LBB{{[0-9]+}}_2: +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: xor %s0, %s0, (33)1 ; CHECK-NEXT: or %s11, 0, %s9 %2 = xor i32 %0, -2147483648 diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll index 6c7b8f9cdf8af..887dc470b3bc8 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll @@ -975,6 +975,54 @@ try.cont: ; preds = %catch.start, %for.e ret void } +; Here an exception is semantically contained in a loop. 'ehcleanup' BB belongs +; to the exception, but does not belong to the loop (because it does not have a +; path back to the loop header), and is placed after the loop latch block +; 'invoke.cont' intentionally. This tests that the 'end_loop' marker is placed +; correctly, not right after the 'invoke.cont' part but after the 'ehcleanup' part. +; NOSORT-LABEL: test18 +; NOSORT: loop +; NOSORT: try +; NOSORT: end_try +; NOSORT: end_loop +define void @test18(i32 %n) personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { +entry: + br label %while.cond + +while.cond: ; preds = %invoke.cont, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %invoke.cont ] + %tobool = icmp ne i32 %n.addr.0, 0 + br i1 %tobool, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %dec = add nsw i32 %n.addr.0, -1 + invoke void @foo() + to label %while.end unwind label %catch.dispatch + +catch.dispatch: ; preds = %while.body + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null] + %2 = call i8* @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ] + invoke void @__cxa_end_catch() [ "funclet"(token %1) ] + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %catch.start + catchret from %1 to label %while.cond + +ehcleanup: ; preds = %catch.start + %5 = cleanuppad within %1 [] + %6 = call i8* @llvm.wasm.get.exception(token %5) + call void @__clang_call_terminate(i8* %6) [ "funclet"(token %5) ] + unreachable + +while.end: ; preds = %while.body, %while.cond + ret void +} + ; Check if the unwind destination mismatch stats are correct ; NOSORT-STAT: 17 wasm-cfg-stackify - Number of EH pad unwind mismatches found diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index 05d256fa5524c..63092a8c51786 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -294,46 +294,6 @@ define <8 x i16> @narrow_unsigned_v8i16(<4 x i32> %low, <4 x i32> %high) { ret <8 x i16> %a } -; CHECK-LABEL: widen_low_signed_v8i16: -; SIMD128-NEXT: .functype widen_low_signed_v8i16 (v128) -> (v128){{$}} -; SIMD128-NEXT: i16x8.widen_low_i8x16_s $push[[R:[0-9]+]]=, $0{{$}} -;
SIMD128-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8>) -define <8 x i16> @widen_low_signed_v8i16(<16 x i8> %v) { - %a = call <8 x i16> @llvm.wasm.widen.low.signed.v8i16.v16i8(<16 x i8> %v) - ret <8 x i16> %a -} - -; CHECK-LABEL: widen_high_signed_v8i16: -; SIMD128-NEXT: .functype widen_high_signed_v8i16 (v128) -> (v128){{$}} -; SIMD128-NEXT: i16x8.widen_high_i8x16_s $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8>) -define <8 x i16> @widen_high_signed_v8i16(<16 x i8> %v) { - %a = call <8 x i16> @llvm.wasm.widen.high.signed.v8i16.v16i8(<16 x i8> %v) - ret <8 x i16> %a -} - -; CHECK-LABEL: widen_low_unsigned_v8i16: -; SIMD128-NEXT: .functype widen_low_unsigned_v8i16 (v128) -> (v128){{$}} -; SIMD128-NEXT: i16x8.widen_low_i8x16_u $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8>) -define <8 x i16> @widen_low_unsigned_v8i16(<16 x i8> %v) { - %a = call <8 x i16> @llvm.wasm.widen.low.unsigned.v8i16.v16i8(<16 x i8> %v) - ret <8 x i16> %a -} - -; CHECK-LABEL: widen_high_unsigned_v8i16: -; SIMD128-NEXT: .functype widen_high_unsigned_v8i16 (v128) -> (v128){{$}} -; SIMD128-NEXT: i16x8.widen_high_i8x16_u $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8>) -define <8 x i16> @widen_high_unsigned_v8i16(<16 x i8> %v) { - %a = call <8 x i16> @llvm.wasm.widen.high.unsigned.v8i16.v16i8(<16 x i8> %v) - ret <8 x i16> %a -} - ; ============================================================================== ; 4 x i32 ; ============================================================================== @@ -411,46 +371,6 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) { ret <4 x i32> %a } -; CHECK-LABEL: widen_low_signed_v4i32: -; SIMD128-NEXT: .functype widen_low_signed_v4i32 (v128) -> (v128){{$}} -; SIMD128-NEXT: i32x4.widen_low_i16x8_s $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16>) -define <4 x i32> @widen_low_signed_v4i32(<8 x i16> %v) { - %a = call <4 x i32> @llvm.wasm.widen.low.signed.v4i32.v8i16(<8 x i16> %v) - ret <4 x i32> %a -} - -; CHECK-LABEL: widen_high_signed_v4i32: -; SIMD128-NEXT: .functype widen_high_signed_v4i32 (v128) -> (v128){{$}} -; SIMD128-NEXT: i32x4.widen_high_i16x8_s $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16>) -define <4 x i32> @widen_high_signed_v4i32(<8 x i16> %v) { - %a = call <4 x i32> @llvm.wasm.widen.high.signed.v4i32.v8i16(<8 x i16> %v) - ret <4 x i32> %a -} - -; CHECK-LABEL: widen_low_unsigned_v4i32: -; SIMD128-NEXT: .functype widen_low_unsigned_v4i32 (v128) -> (v128){{$}} -; SIMD128-NEXT: i32x4.widen_low_i16x8_u $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16>) -define <4 x i32> @widen_low_unsigned_v4i32(<8 x i16> %v) { - %a = call <4 x i32> @llvm.wasm.widen.low.unsigned.v4i32.v8i16(<8 x i16> %v) - ret <4 x i32> %a -} - -; CHECK-LABEL: widen_high_unsigned_v4i32: -; SIMD128-NEXT: .functype widen_high_unsigned_v4i32 (v128) -> (v128){{$}} -; SIMD128-NEXT: i32x4.widen_high_i16x8_u $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> 
@llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16>) -define <4 x i32> @widen_high_unsigned_v4i32(<8 x i16> %v) { - %a = call <4 x i32> @llvm.wasm.widen.high.unsigned.v4i32.v8i16(<8 x i16> %v) - ret <4 x i32> %a -} - ; ============================================================================== ; 2 x i64 ; ============================================================================== diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll new file mode 100644 index 0000000000000..ab3643653deb4 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +; Test SIMD v128.load{32,64}_zero instructions + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +declare <4 x i32> @llvm.wasm.load32.zero(i32*) +declare <2 x i64> @llvm.wasm.load64.zero(i64*) + +;===---------------------------------------------------------------------------- +; v128.load32_zero +;===---------------------------------------------------------------------------- + +define <4 x i32> @load_zero_i32_no_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_no_offset: +; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: # fallthrough-return + %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p) + ret <4 x i32> %v +} + +define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_with_folded_offset: +; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 24 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint i32* %p to i32 + %r = add nuw i32 %q, 24 + %s = inttoptr i32 %r to i32* + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_with_folded_gep_offset: +; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load32_zero 24 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds i32, i32* %p, i32 6 + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset: +; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const -24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds i32, i32* %p, i32 -6 + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +define <4 x i32> @load_zero_i32_with_unfolded_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_with_unfolded_offset: +; CHECK: .functype load_zero_i32_with_unfolded_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint i32* %p to i32 + %r = add nsw i32 %q, 24 + %s = inttoptr i32 %r to i32* + %t = tail call <4 x i32> 
@llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +define <4 x i32> @load_zero_i32_with_unfolded_gep_offset(i32* %p) { +; CHECK-LABEL: load_zero_i32_with_unfolded_gep_offset: +; CHECK: .functype load_zero_i32_with_unfolded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load32_zero 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr i32, i32* %p, i32 6 + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +define <4 x i32> @load_zero_i32_from_numeric_address() { +; CHECK-LABEL: load_zero_i32_from_numeric_address: +; CHECK: .functype load_zero_i32_from_numeric_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero 42 +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 42 to i32* + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s) + ret <4 x i32> %t +} + +@gv_i32 = global i32 0 +define <4 x i32> @load_zero_i32_from_global_address() { +; CHECK-LABEL: load_zero_i32_from_global_address: +; CHECK: .functype load_zero_i32_from_global_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load32_zero gv_i32 +; CHECK-NEXT: # fallthrough-return + %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* @gv_i32) + ret <4 x i32> %t +} + +;===---------------------------------------------------------------------------- +; v128.load64_zero +;===---------------------------------------------------------------------------- + +define <2 x i64> @load_zero_i64_no_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_no_offset: +; CHECK: .functype load_zero_i64_no_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load64_zero 0 +; CHECK-NEXT: # fallthrough-return + %v = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %p) + ret <2 x i64> %v +} + +define <2 x i64> @load_zero_i64_with_folded_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_with_folded_offset: +; CHECK: .functype load_zero_i64_with_folded_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load64_zero 24 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint i64* %p to i32 + %r = add nuw i32 %q, 24 + %s = inttoptr i32 %r to i64* + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +define <2 x i64> @load_zero_i64_with_folded_gep_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_with_folded_gep_offset: +; CHECK: .functype load_zero_i64_with_folded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load64_zero 48 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds i64, i64* %p, i64 6 + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +define <2 x i64> @load_zero_i64_with_unfolded_gep_negative_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_with_unfolded_gep_negative_offset: +; CHECK: .functype load_zero_i64_with_unfolded_gep_negative_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const -48 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_zero 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds i64, i64* %p, i64 -6 + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +define <2 x i64> @load_zero_i64_with_unfolded_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_with_unfolded_offset: +; CHECK: .functype load_zero_i64_with_unfolded_offset 
(i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_zero 0 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint i64* %p to i32 + %r = add nsw i32 %q, 24 + %s = inttoptr i32 %r to i64* + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +define <2 x i64> @load_zero_i64_with_unfolded_gep_offset(i64* %p) { +; CHECK-LABEL: load_zero_i64_with_unfolded_gep_offset: +; CHECK: .functype load_zero_i64_with_unfolded_gep_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_zero 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr i64, i64* %p, i64 6 + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +define <2 x i64> @load_zero_i64_from_numeric_address() { +; CHECK-LABEL: load_zero_i64_from_numeric_address: +; CHECK: .functype load_zero_i64_from_numeric_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load64_zero 42 +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 42 to i64* + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s) + ret <2 x i64> %t +} + +@gv_i64 = global i64 0 +define <2 x i64> @load_zero_i64_from_global_address() { +; CHECK-LABEL: load_zero_i64_from_global_address: +; CHECK: .functype load_zero_i64_from_global_address () -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: v128.load64_zero gv_i64 +; CHECK-NEXT: # fallthrough-return + %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* @gv_i64) + ret <2 x i64> %t +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll index 8d39ddc20f244..b2d32936df130 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll @@ -918,6 +918,24 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) { ret void } +define void @store_narrowing_v8i16(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16: +; CHECK: .functype store_narrowing_v8i16 (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + store <8 x i8> %v, <8 x i8>* %p + ret void +} + define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) { ; CHECK-LABEL: store_v8i16_with_folded_offset: ; CHECK: .functype store_v8i16_with_folded_offset (v128, i32) -> () @@ -933,6 +951,27 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) { ret void } +define void @store_narrowing_v8i16_with_folded_offset(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16_with_folded_offset: +; CHECK: .functype store_narrowing_v8i16_with_folded_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 16 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <8 x i8>* %p to i32 + %r = add nuw i32 %q, 16 + %s = inttoptr i32 %r to <8 x i8>* + store <8 x i8> %v , <8 x i8>* %s + ret void +} + 
define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) { ; CHECK-LABEL: store_v8i16_with_folded_gep_offset: ; CHECK: .functype store_v8i16_with_folded_gep_offset (v128, i32) -> () @@ -946,6 +985,25 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) { ret void } +define void @store_narrowing_v8i16_with_folded_gep_offset(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16_with_folded_gep_offset: +; CHECK: .functype store_narrowing_v8i16_with_folded_gep_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1 + store <8 x i8> %v , <8 x i8>* %s + ret void +} + define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) { ; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset: ; CHECK: .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> () @@ -961,6 +1019,27 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1 ret void } +define void @store_narrowing_v8i16_with_unfolded_gep_negative_offset(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_negative_offset: +; CHECK: .functype store_narrowing_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const -8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1 + store <8 x i8> %v , <8 x i8>* %s + ret void +} + define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) { ; CHECK-LABEL: store_v8i16_with_unfolded_offset: ; CHECK: .functype store_v8i16_with_unfolded_offset (v128, i32) -> () @@ -978,6 +1057,29 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) { ret void } +define void @store_narrowing_v8i16_with_unfolded_offset(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_offset: +; CHECK: .functype store_narrowing_v8i16_with_unfolded_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <8 x i8>* %p to i32 + %r = add nsw i32 %q, 16 + %s = inttoptr i32 %r to <8 x i8>* + store <8 x i8> %v , <8 x i8>* %s + ret void +} + define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) { ; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset: ; CHECK: .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> () @@ -993,6 +1095,27 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) { ret void } +define void @store_narrowing_v8i16_with_unfolded_gep_offset(<8 x i8> %v, <8 x i8>* %p) { 
+; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_offset: +; CHECK: .functype store_narrowing_v8i16_with_unfolded_gep_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1 + store <8 x i8> %v , <8 x i8>* %s + ret void +} + define void @store_v8i16_to_numeric_address(<8 x i16> %v) { ; CHECK-LABEL: store_v8i16_to_numeric_address: ; CHECK: .functype store_v8i16_to_numeric_address (v128) -> () @@ -1006,6 +1129,25 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) { ret void } +define void @store_narrowing_v8i16_to_numeric_address(<8 x i8> %v, <8 x i8>* %p) { +; CHECK-LABEL: store_narrowing_v8i16_to_numeric_address: +; CHECK: .functype store_narrowing_v8i16_to_numeric_address (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 32 +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 32 to <8 x i8>* + store <8 x i8> %v , <8 x i8>* %s + ret void +} + define void @store_v8i16_to_global_address(<8 x i16> %v) { ; CHECK-LABEL: store_v8i16_to_global_address: ; CHECK: .functype store_v8i16_to_global_address (v128) -> () @@ -1018,6 +1160,24 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) { ret void } +define void @store_narrowing_v8i16_to_global_address(<8 x i8> %v) { +; CHECK-LABEL: store_narrowing_v8i16_to_global_address: +; CHECK: .functype store_narrowing_v8i16_to_global_address (v128) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: i32.const 16711935 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i8x16.narrow_i16x8_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store gv_v8i8 +; CHECK-NEXT: # fallthrough-return + store <8 x i8> %v , <8 x i8>* @gv_v8i8 + ret void +} + ; ============================================================================== ; 4 x i32 ; ============================================================================== @@ -1588,6 +1748,24 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) { ret void } +define void @store_narrowing_v4i32(<4 x i16> %v, <4 x i16>* %p) { +; CHECK-LABEL: store_narrowing_v4i32: +; CHECK: .functype store_narrowing_v4i32 (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + store <4 x i16> %v , <4 x i16>* %p + ret void +} + define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) { ; CHECK-LABEL: store_v4i32_with_folded_offset: ; CHECK: .functype store_v4i32_with_folded_offset (v128, i32) -> () @@ -1603,6 +1781,27 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) { ret void } +define void @store_narrowing_v4i32_with_folded_offset(<4 x i16> %v, <4 x i16>* 
%p) { +; CHECK-LABEL: store_narrowing_v4i32_with_folded_offset: +; CHECK: .functype store_narrowing_v4i32_with_folded_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 16 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i16>* %p to i32 + %r = add nuw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i16>* + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) { ; CHECK-LABEL: store_v4i32_with_folded_gep_offset: ; CHECK: .functype store_v4i32_with_folded_gep_offset (v128, i32) -> () @@ -1616,6 +1815,25 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) { ret void } +define void @store_narrowing_v4i32_with_folded_gep_offset(<4 x i16> %v, <4 x i16>* %p) { +; CHECK-LABEL: store_narrowing_v4i32_with_folded_gep_offset: +; CHECK: .functype store_narrowing_v4i32_with_folded_gep_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1 + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) { ; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset: ; CHECK: .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> () @@ -1631,6 +1849,27 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3 ret void } +define void @store_narrowing_v4i32_with_unfolded_gep_negative_offset(<4 x i16> %v, <4 x i16>* %p) { +; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_negative_offset: +; CHECK: .functype store_narrowing_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const -8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1 + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) { ; CHECK-LABEL: store_v4i32_with_unfolded_offset: ; CHECK: .functype store_v4i32_with_unfolded_offset (v128, i32) -> () @@ -1648,6 +1887,29 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) { ret void } +define void @store_narrowing_v4i32_with_unfolded_offset(<4 x i16> %v, <4 x i16>* %p) { +; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_offset: +; CHECK: .functype store_narrowing_v4i32_with_unfolded_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: 
i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %q = ptrtoint <4 x i16>* %p to i32 + %r = add nsw i32 %q, 16 + %s = inttoptr i32 %r to <4 x i16>* + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) { ; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset: ; CHECK: .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> () @@ -1663,6 +1925,27 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) { ret void } +define void @store_narrowing_v4i32_with_unfolded_gep_offset(<4 x i16> %v, <4 x i16>* %p) { +; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_offset: +; CHECK: .functype store_narrowing_v4i32_with_unfolded_gep_offset (v128, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1 + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_to_numeric_address(<4 x i32> %v) { ; CHECK-LABEL: store_v4i32_to_numeric_address: ; CHECK: .functype store_v4i32_to_numeric_address (v128) -> () @@ -1676,6 +1959,25 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) { ret void } +define void @store_narrowing_v4i32_to_numeric_address(<4 x i16> %v) { +; CHECK-LABEL: store_narrowing_v4i32_to_numeric_address: +; CHECK: .functype store_narrowing_v4i32_to_numeric_address (v128) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store 32 +; CHECK-NEXT: # fallthrough-return + %s = inttoptr i32 32 to <4 x i16>* + store <4 x i16> %v , <4 x i16>* %s + ret void +} + define void @store_v4i32_to_global_address(<4 x i32> %v) { ; CHECK-LABEL: store_v4i32_to_global_address: ; CHECK: .functype store_v4i32_to_global_address (v128) -> () @@ -1688,6 +1990,24 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) { ret void } +define void @store_narrowing_v4i32_to_global_address(<4 x i16> %v) { +; CHECK-LABEL: store_narrowing_v4i32_to_global_address: +; CHECK: .functype store_narrowing_v4i32_to_global_address (v128) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32x4.splat +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.and +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i16x8.narrow_i32x4_u +; CHECK-NEXT: i64x2.extract_lane 0 +; CHECK-NEXT: i64.store gv_v4i16 +; CHECK-NEXT: # fallthrough-return + store <4 x i16> %v , <4 x i16>* @gv_v4i16 + ret void +} + ; ============================================================================== ; 2 x i64 ; ============================================================================== diff --git a/llvm/test/CodeGen/WebAssembly/simd-widening.ll b/llvm/test/CodeGen/WebAssembly/simd-widening.ll new file mode 100644 index 0000000000000..c9a7ffbbfcaf5 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-widening.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mattr=+simd128 | FileCheck 
%s
+
+;; Test that SIMD widening operations can be successfully selected
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+define <8 x i16> @widen_low_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_low_i8x16_s:
+; CHECK: .functype widen_low_i8x16_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.widen_low_i8x16_s
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %widened = sext <8 x i8> %low to <8 x i16>
+ ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_low_i8x16_u(<16 x i8> %v) {
+; CHECK-LABEL: widen_low_i8x16_u:
+; CHECK: .functype widen_low_i8x16_u (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.widen_low_i8x16_u
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+ <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %widened = zext <8 x i8> %low to <8 x i16>
+ ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_high_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_high_i8x16_s:
+; CHECK: .functype widen_high_i8x16_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.widen_high_i8x16_s
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+ <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %widened = sext <8 x i8> %low to <8 x i16>
+ ret <8 x i16> %widened
+}
+
+define <8 x i16> @widen_high_i8x16_u(<16 x i8> %v) {
+; CHECK-LABEL: widen_high_i8x16_u:
+; CHECK: .functype widen_high_i8x16_u (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.widen_high_i8x16_u
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <16 x i8> %v, <16 x i8> undef,
+ <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %widened = zext <8 x i8> %low to <8 x i16>
+ ret <8 x i16> %widened
+}
+
+define <4 x i32> @widen_low_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_low_i16x8_s:
+; CHECK: .functype widen_low_i16x8_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.widen_low_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+ <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %widened = sext <4 x i16> %low to <4 x i32>
+ ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_low_i16x8_u(<8 x i16> %v) {
+; CHECK-LABEL: widen_low_i16x8_u:
+; CHECK: .functype widen_low_i16x8_u (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.widen_low_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+ <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %widened = zext <4 x i16> %low to <4 x i32>
+ ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_high_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_high_i16x8_s:
+; CHECK: .functype widen_high_i16x8_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.widen_high_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+ <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %widened = sext <4 x i16> %low to <4 x i32>
+ ret <4 x i32> %widened
+}
+
+define <4 x i32> @widen_high_i16x8_u(<8 x i16> %v) {
+; CHECK-LABEL: widen_high_i16x8_u:
+; CHECK: .functype widen_high_i16x8_u (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.widen_high_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %low = shufflevector <8 x i16> %v, <8 x i16> undef,
+ <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %widened = zext <4 x i16> %low to <4 x i32>
+ ret <4 x i32> %widened
+}
+
+;; Also test that similar patterns with offsets not corresponding to
+;; the low or high half are correctly expanded.
+
+define <8 x i16> @widen_lowish_i8x16_s(<16 x i8> %v) {
+; CHECK-LABEL: widen_lowish_i8x16_s:
+; CHECK: .functype widen_lowish_i8x16_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 1
+; CHECK-NEXT: i16x8.splat
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 2
+; CHECK-NEXT: i16x8.replace_lane 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 3
+; CHECK-NEXT: i16x8.replace_lane 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 4
+; CHECK-NEXT: i16x8.replace_lane 3
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 5
+; CHECK-NEXT: i16x8.replace_lane 4
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 6
+; CHECK-NEXT: i16x8.replace_lane 5
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 7
+; CHECK-NEXT: i16x8.replace_lane 6
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 8
+; CHECK-NEXT: i16x8.replace_lane 7
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i16x8.shl
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i16x8.shr_s
+; CHECK-NEXT: # fallthrough-return
+ %lowish = shufflevector <16 x i8> %v, <16 x i8> undef,
+ <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ %widened = sext <8 x i8> %lowish to <8 x i16>
+ ret <8 x i16> %widened
+}
+
+define <4 x i32> @widen_lowish_i16x8_s(<8 x i16> %v) {
+; CHECK-LABEL: widen_lowish_i16x8_s:
+; CHECK: .functype widen_lowish_i16x8_s (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extract_lane_u 1
+; CHECK-NEXT: i32x4.splat
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extract_lane_u 2
+; CHECK-NEXT: i32x4.replace_lane 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extract_lane_u 3
+; CHECK-NEXT: i32x4.replace_lane 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extract_lane_u 4
+; CHECK-NEXT: i32x4.replace_lane 3
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shl
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: # fallthrough-return
+ %lowish = shufflevector <8 x i16> %v, <8 x i16> undef,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ %widened = sext <4 x i16> %lowish to <4 x i32>
+ ret <4 x i32> %widened
+}
diff --git a/llvm/test/CodeGen/WebAssembly/switch.ll b/llvm/test/CodeGen/WebAssembly/switch.ll
index fbb59ddd10610..717ddcda6f321 100644
--- a/llvm/test/CodeGen/WebAssembly/switch.ll
+++ b/llvm/test/CodeGen/WebAssembly/switch.ll
@@ -1,11 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs | FileCheck %s

 ; Test switch instructions. Block placement is disabled because it reorders
 ; the blocks in a way that isn't interesting here.
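The bar32 and bar64 functions in this file feed llc dense switches over i32 and i64 case values. On WebAssembly such a switch is expected to collapse into a single br_table whose operands are relative depths of the enclosing blocks (one per case value, plus a trailing default), which is what the br_table CHECK lines assert for both the wasm32 and wasm64 RUN lines above. As a minimal sketch of the input shape being tested (the @small_switch name is ours for illustration; @foo0 and @foo1 are the callees declared just below):

define void @small_switch(i32 %n) {
entry:
  ; A dense jump table: case values 0 and 1 map to distinct blocks.
  switch i32 %n, label %sw.epilog [
    i32 0, label %sw.bb0
    i32 1, label %sw.bb1
  ]
sw.bb0:
  call void @foo0()
  br label %sw.epilog
sw.bb1:
  call void @foo1()
  br label %sw.epilog
sw.epilog:
  ret void
}

For the i64 variant, bar64's CHECK lines additionally expect an explicit i64.gt_u bounds check with a br_if to the default block followed by i32.wrap_i64, since br_table consumes a 32-bit index.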
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" -target triple = "wasm32-unknown-unknown" - declare void @foo0() declare void @foo1() declare void @foo2() @@ -14,28 +12,28 @@ declare void @foo4() declare void @foo5() ; CHECK-LABEL: bar32: -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6{{$}} -; CHECK: .LBB{{[0-9]+}}_1: -; CHECK: call foo0{{$}} -; CHECK: .LBB{{[0-9]+}}_2: -; CHECK: call foo1{{$}} -; CHECK: .LBB{{[0-9]+}}_3: -; CHECK: call foo2{{$}} -; CHECK: .LBB{{[0-9]+}}_4: -; CHECK: call foo3{{$}} -; CHECK: .LBB{{[0-9]+}}_5: -; CHECK: call foo4{{$}} -; CHECK: .LBB{{[0-9]+}}_6: -; CHECK: call foo5{{$}} -; CHECK: .LBB{{[0-9]+}}_7: -; CHECK: return{{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK-NEXT: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6{{$}} +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK: call foo0{{$}} +; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: call foo1{{$}} +; CHECK: .LBB{{[0-9]+}}_3: +; CHECK: call foo2{{$}} +; CHECK: .LBB{{[0-9]+}}_4: +; CHECK: call foo3{{$}} +; CHECK: .LBB{{[0-9]+}}_5: +; CHECK: call foo4{{$}} +; CHECK: .LBB{{[0-9]+}}_6: +; CHECK: call foo5{{$}} +; CHECK: .LBB{{[0-9]+}}_7: +; CHECK: return{{$}} define void @bar32(i32 %n) { entry: switch i32 %n, label %sw.epilog [ @@ -94,32 +92,32 @@ sw.epilog: ; preds = %entry, %sw.bb.5, %s } ; CHECK-LABEL: bar64: -; CHECK: block {{$}} -; CHECK: i64.const -; CHECK: i64.gt_u -; CHECK: br_if 0 -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: block {{$}} -; CHECK: i32.wrap_i64 -; CHECK: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 0{{$}} -; CHECK: .LBB{{[0-9]+}}_2: -; CHECK: call foo0{{$}} -; CHECK: .LBB{{[0-9]+}}_3: -; CHECK: call foo1{{$}} -; CHECK: .LBB{{[0-9]+}}_4: -; CHECK: call foo2{{$}} -; CHECK: .LBB{{[0-9]+}}_5: -; CHECK: call foo3{{$}} -; CHECK: .LBB{{[0-9]+}}_6: -; CHECK: call foo4{{$}} -; CHECK: .LBB{{[0-9]+}}_7: -; CHECK: call foo5{{$}} -; CHECK: .LBB{{[0-9]+}}_8: -; CHECK: return{{$}} +; CHECK: block {{$}} +; CHECK: i64.const +; CHECK: i64.gt_u +; CHECK: br_if 0 +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: block {{$}} +; CHECK: i32.wrap_i64 +; CHECK-NEXT: br_table {{[^,]+}}, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 0{{$}} +; CHECK: .LBB{{[0-9]+}}_2: +; CHECK: call foo0{{$}} +; CHECK: .LBB{{[0-9]+}}_3: +; CHECK: call foo1{{$}} +; CHECK: .LBB{{[0-9]+}}_4: +; CHECK: call foo2{{$}} +; CHECK: .LBB{{[0-9]+}}_5: +; CHECK: call foo3{{$}} +; CHECK: .LBB{{[0-9]+}}_6: +; CHECK: call foo4{{$}} +; CHECK: .LBB{{[0-9]+}}_7: +; CHECK: call foo5{{$}} +; CHECK: .LBB{{[0-9]+}}_8: +; CHECK: return{{$}} define void @bar64(i64 %n) { entry: switch i64 %n, label %sw.epilog [ @@ -178,24 +176,24 @@ sw.epilog: ; preds = %entry, %sw.bb.5, %s } ; CHECK-LABEL: truncated: -; CHECK: block -; CHECK: block -; CHECK: block -; CHECK: i32.wrap_i64 -; CHECK: br_table {{[^,]+}}, 0, 1, 2{{$}} -; CHECK: .LBB{{[0-9]+}}_1 -; CHECK: end_block -; CHECK: call foo0{{$}} -; CHECK: return{{$}} -; CHECK: .LBB{{[0-9]+}}_2 -; CHECK: end_block -; CHECK: call foo1{{$}} -; CHECK: 
return{{$}} -; CHECK: .LBB{{[0-9]+}}_3 -; CHECK: end_block -; CHECK: call foo2{{$}} -; CHECK: return{{$}} -; CHECK: end_function +; CHECK: block +; CHECK: block +; CHECK: block +; CHECK: i32.wrap_i64 +; CHECK-NEXT: br_table {{[^,]+}}, 0, 1, 2{{$}} +; CHECK: .LBB{{[0-9]+}}_1 +; CHECK: end_block +; CHECK: call foo0{{$}} +; CHECK: return{{$}} +; CHECK: .LBB{{[0-9]+}}_2 +; CHECK: end_block +; CHECK: call foo1{{$}} +; CHECK: return{{$}} +; CHECK: .LBB{{[0-9]+}}_3 +; CHECK: end_block +; CHECK: call foo2{{$}} +; CHECK: return{{$}} +; CHECK: end_function define void @truncated(i64 %n) { entry: %m = trunc i64 %n to i32 diff --git a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll index 53b710324292d..027e9b5a86f43 100644 --- a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll +++ b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll @@ -8,7 +8,7 @@ define float @test(<4 x float>* %A) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movaps (%eax), %xmm0 -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X86-NEXT: xorps %xmm1, %xmm1 ; X86-NEXT: movaps %xmm1, (%eax) ; X86-NEXT: movss %xmm0, (%esp) @@ -19,7 +19,7 @@ define float @test(<4 x float>* %A) nounwind { ; X64-LABEL: test: ; X64: # %bb.0: # %entry ; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: movaps %xmm1, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll new file mode 100644 index 0000000000000..119821e91b38c --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=x86_64-linux-gnu < %s | FileCheck %s + +; FIXME: Calling convention lowering fails +; define half @roundeven_f16(half %x) { +; %roundeven = call half @llvm.roundeven.f16(half %x) +; ret half %roundeven +; } + +define float @roundeven_f32(float %x) { +; CHECK-LABEL: roundeven_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq roundevenf +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call float @llvm.roundeven.f32(float %x) + ret float %roundeven +} + +define double @roundeven_f64(double %x) { +; CHECK-LABEL: roundeven_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq roundeven +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call double @llvm.roundeven.f64(double %x) + ret double %roundeven +} + +; FIXME: Insert fails +; define x86_fp80 @roundeven_fp80(x86_fp80 %x) { +; %roundeven = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %x) +; ret x86_fp80 %roundeven +; } + +define fp128 @roundeven_f128(fp128 %x) { +; CHECK-LABEL: roundeven_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq roundevenl +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call fp128 @llvm.roundeven.f128(fp128 %x) + ret fp128 %roundeven +} + +; FIXME: Fails on build_vector +; define <4 x float> @roundeven_v4f32(<4 x float> %x) { +; %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) +; ret <4 x float> %roundeven +; } + +declare 
half @llvm.roundeven.f16(half) #0 +declare float @llvm.roundeven.f32(float) #0 +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0 +declare double @llvm.roundeven.f64(double) #0 +declare x86_fp80 @llvm.roundeven.f80(x86_fp80) #0 +declare fp128 @llvm.roundeven.f128(fp128) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index 17b58a581db1c..7cd38af00079f 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -113,7 +113,7 @@ define void @fadd_64r(double* %loc, double %val) nounwind { ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) @@ -278,7 +278,7 @@ define void @fadd_64g() nounwind { ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) @@ -441,7 +441,7 @@ define void @fadd_64imm() nounwind { ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) @@ -610,7 +610,7 @@ define void @fadd_64stack() nounwind { ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: faddl (%esp) @@ -716,7 +716,7 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll index 664f195e3884a..a963865136054 100644 --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -278,7 +278,7 @@ define double @load_double(double* %fptr) { ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movss %xmm0, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: addl $12, %esp @@ -665,7 +665,7 @@ define double @load_double_seq_cst(double* %fptr) { ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 ; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movss %xmm0, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; 
X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: addl $12, %esp diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index d2638a1681e85..8cd4b2f2571f3 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -160,30 +160,30 @@ define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vmovdqa (%rsi), %xmm6 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero @@ -454,27 +454,27 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -482,35 +482,35 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero @@ -2386,12 +2386,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] @@ -2402,7 +2402,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] ; SSE2-NEXT: movupd %xmm1, (%rax) @@ -2558,7 +2558,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll index 653a88edd26a0..b3fa8ac9aeeda 100644 --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -33,7 +33,7 @@ define <8 x float> @sitofp02(<8 x i16> %a) { ; AVX-LABEL: sitofp02: ; AVX: # %bb.0: ; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 746b068cec2fb..35c449e813c09 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1895,7 +1895,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl @@ -1903,7 +1903,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X64-LABEL: test_mm256_set1_epi16: ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll index 26fefb26fff41..3755cf4740abe 100644 --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64
@@ -16,7 +16,7 @@ entry:
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: funcB:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
@@ -134,7 +134,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
 ; X64-LABEL: funcF:
 ; X64: # %bb.0:
 ; X64-NEXT: vmovd %edi, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X64-NEXT: retq
 %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index 609c02eee0902..89baa50bf0a40 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -624,12 +624,12 @@ entry:
 define <4 x i32> @H(<4 x i32> %a) {
 ; X32-LABEL: H:
 ; X32: ## %bb.0: ## %entry
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: H:
 ; X64: ## %bb.0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X64-NEXT: retq
 entry:
 %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index 89f74a45e6b7b..baef0fb8567bd 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -29,7 +29,7 @@ define void @insert_crash() nounwind {
 ; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; CHECK-NEXT: vmovups %xmm0, (%rax)
 ; CHECK-NEXT: retq
 allocas:
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
index 26a5cd328d5c7..a519f55aaafe5 100644
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -265,7 +265,7 @@ define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounw
 ;
 ; AVX2-LABEL: shuffle_v8f32_uu67uu67:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
 ; AVX2-NEXT: retq
 entry:
 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
index 4c257a0edf9d7..99ebd4baffcca 100644
--- a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -8,7 +8,7 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
 ; KNL: # %bb.0:
 ; KNL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; KNL-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-;
KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; KNL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index e92528b9bbcc8..64f281b7ffdf1 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -217,7 +217,7 @@ define <4 x i64> @f64to4sl(<4 x double> %a) { define <4 x i64> @f32to4sl(<4 x float> %a) { ; NODQ-LABEL: f32to4sl: ; NODQ: # %bb.0: -; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; NODQ-NEXT: vcvttss2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index 1fd3e15c3e019..9e6c8b8becbc9 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -5,18 +5,18 @@ define i32 @hadd_16(<16 x i32> %x225) { ; KNL-LABEL: hadd_16: ; KNL: # %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper @@ -32,18 +32,18 @@ define i32 @hadd_16(<16 x i32> %x225) { define i32 @hsub_16(<16 x i32> %x225) { ; KNL-LABEL: hsub_16: ; KNL: # %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: hsub_16: ; SKX: # %bb.0: -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index f6ffd6419c13a..41bdaf21baa38 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -1692,16 +1692,15 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm2 -; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vpcmpeqb %ymm1, 
%ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: andl $63, %esi ; KNL-NEXT: testb %dil, %dil -; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm2, (%rsp) +; KNL-NEXT: vmovdqa64 %zmm0, (%rsp) ; KNL-NEXT: setne (%rsp,%rsi) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1772,116 +1771,115 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: subq $192, %rsp ; KNL-NEXT: movl 744(%rbp), %eax ; KNL-NEXT: andl $127, %eax -; KNL-NEXT: vmovd %edi, %xmm0 -; KNL-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, 264(%rbp), 
%xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2 ; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3 
+; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; KNL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm2 -; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 -; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm4, %xmm4 -; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; KNL-NEXT: vpcmpeqb %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 712(%rbp), 
%xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: cmpb $0, 736(%rbp) -; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm0, (%rsp) +; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa64 %zmm1, (%rsp) ; KNL-NEXT: setne (%rsp,%rax) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2079,23 +2077,21 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $192, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi -; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 -; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm4 -; KNL-NEXT: vpternlogq $15, %zmm4, %zmm4, %zmm4 -; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: andl $127, %esi ; KNL-NEXT: testb %dil, %dil -; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm4, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vmovdqa %ymm3, (%rsp) +; KNL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovdqa64 %zmm0, (%rsp) ; KNL-NEXT: setne (%rsp,%rsi) ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index f115f9a6ef382..2bc798c06b777 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6550,7 +6550,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6563,7 +6563,7 @@ define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6602,7 +6602,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6636,7 +6636,7 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6668,7 +6668,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) { ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6681,7 +6681,7 @@ define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) { ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6706,7 +6706,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) { ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6719,7 +6719,7 @@ define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) { ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6747,7 +6747,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6762,7 +6762,7 @@ define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6807,7 +6807,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; 
X86-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6844,7 +6844,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 @@ -6882,7 +6882,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6898,7 +6898,7 @@ define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6928,7 +6928,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -6943,7 +6943,7 @@ define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -6972,7 +6972,7 @@ define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) { ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper @@ -7006,7 +7006,7 @@ define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) { ; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper @@ -7040,7 +7040,7 @@ define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) { ; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper @@ -7070,7 +7070,7 @@ define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) { ; 
CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper @@ -7103,7 +7103,7 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper @@ -7119,7 +7119,7 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper @@ -7161,7 +7161,7 @@ define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper @@ -7178,7 +7178,7 @@ define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper @@ -7220,7 +7220,7 @@ define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper @@ -7237,7 +7237,7 @@ define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper @@ -7274,7 +7274,7 @@ define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper @@ -7290,7 +7290,7 @@ define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll index 88a3b5aea9bd4..24e58149eb4c2 100644 --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -887,35 +887,157 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) } define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) { -; KNL-LABEL: ternlog_or_and_mask: +; ALL-LABEL: ternlog_or_and_mask: +; ALL: ## %bb.0: +; ALL-NEXT: vpternlogd $236, {{.*}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: retq + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + ret <16 x i32> %b +} + +define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) { +; ALL-LABEL: ternlog_xor_and_mask: +; ALL: ## %bb.0: +; ALL-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm1, %zmm0 +; ALL-NEXT: retq + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + ret <8 x i64> %b +} + +define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask: ; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 ; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; -; SKX-LABEL: ternlog_or_and_mask: +; SKX-LABEL: ternlog_maskz_or_and_mask: ; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 ; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer %a = and <16 x i32> %x, %b = or <16 x i32> %a, %y - ret <16 x i32> %b + %c = select <16 x i1> %m, <16 x i32> %b, <16 x i32> zeroinitializer + ret <16 x i32> %c } -define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) { -; KNL-LABEL: ternlog_xor_and_mask: +define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask: ; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 ; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> zeroinitializer + ret <8 x i64> %c +} + +define <16 x i32> @ternlog_maskx_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + %c = select <16 x i1> %m, <16 x i32> %b, 
<16 x i32> %x + ret <16 x i32> %c +} + +define <16 x i32> @ternlog_masky_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) { +; KNL-LABEL: ternlog_masky_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL-NEXT: retq ; -; SKX-LABEL: ternlog_xor_and_mask: +; SKX-LABEL: ternlog_masky_or_and_mask: ; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %zmm2, %k1 ; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq + %m = icmp slt <16 x i32> %mask, zeroinitializer + %a = and <16 x i32> %x, + %b = or <16 x i32> %a, %y + %c = select <16 x i1> %m, <16 x i32> %b, <16 x i32> %y + ret <16 x i32> %c +} + +define <8 x i64> @ternlog_maskx_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm2 +; KNL-NEXT: vpxorq %zmm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vxorpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer %a = and <8 x i64> %x, %b = xor <8 x i64> %a, %y - ret <8 x i64> %b + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> %x + ret <8 x i64> %c +} + +define <8 x i64> @ternlog_masky_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %zmm2, %zmm3, %k1 +; KNL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %zmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq + %m = icmp slt <8 x i64> %mask, zeroinitializer + %a = and <8 x i64> %x, + %b = xor <8 x i64> %a, %y + %c = select <8 x i1> %m, <8 x i64> %b, <8 x i64> %y + ret <8 x i64> %c } diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 24b4c322c54b8..050269c2ba2f3 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s ; FIXME: All cases here should be fixed by PR34380 @@ -2143,11 +2143,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,3,4,2,0,3,4] -; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; CHECK-NEXT: 
vpermq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2158,10 +2158,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -3426,7 +3427,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <4,14,4,14,u,u,u,u> +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [60129542148,60129542148,60129542148,60129542148] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3444,7 +3445,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <4,14,4,14,u,u,u,u> +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [60129542148,60129542148,60129542148,60129542148] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll index 26d905ebeae77..13c4c8afb9a8f 100644 --- a/llvm/test/CodeGen/X86/avx512vl-logic.ll +++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll @@ -991,8 +991,7 @@ define <4 x i32> @ternlog_xor_andn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogd $236, {{.*}}(%rip), %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <4 x i32> %x, %b = or <4 x i32> %a, %y @@ -1002,8 +1001,7 @@ define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) { define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: ternlog_or_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogd $236, {{.*}}(%rip), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <8 x i32> %x, %b = or <8 x i32> %a, %y @@ -1013,8 +1011,7 @@ define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) { define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps 
{{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 ; CHECK-NEXT: retq %a = and <2 x i64> %x, %b = xor <2 x i64> %a, %y @@ -1024,10 +1021,279 @@ define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) { define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) { ; CHECK-LABEL: ternlog_xor_and_mask_ymm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0 ; CHECK-NEXT: retq %a = and <4 x i64> %x, %b = xor <4 x i64> %a, %y ret <4 x i64> %b } + +define <4 x i32> @ternlog_maskz_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> zeroinitializer + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_maskz_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_maskz_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> zeroinitializer + ret <8 x i32> %c +} + +define <2 x i64> @ternlog_maskz_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> zeroinitializer + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_maskz_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_maskz_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskz_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and 
<4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> zeroinitializer + ret <4 x i64> %c +} + +define <4 x i32> @ternlog_maskx_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> %x + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_maskx_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_maskx_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> %x + ret <8 x i32> %c +} + +define <2 x i64> @ternlog_maskx_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; KNL-NEXT: vpxorq %xmm1, %xmm2, %xmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vxorpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> %x + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_maskx_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_maskx_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpxorq %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_maskx_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vxorpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and <4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> %x + ret <4 x i64> %c +} + +define <4 x i32> @ternlog_masky_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) { +; KNL-LABEL: ternlog_masky_or_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtd %xmm3, %xmm2, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpord %xmm1, %xmm0, %xmm1 {%k1} +; KNL-NEXT: vmovdqa %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_or_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %xmm3, 
%k1 +; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vorps %xmm1, %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: retq + %m = icmp slt <4 x i32> %mask, zeroinitializer + %a = and <4 x i32> %x, + %b = or <4 x i32> %a, %y + %c = select <4 x i1> %m, <4 x i32> %b, <4 x i32> %y + ret <4 x i32> %c +} + +define <8 x i32> @ternlog_masky_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) { +; KNL-LABEL: ternlog_masky_or_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_or_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovd2m %ymm2, %k1 +; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: retq + %m = icmp slt <8 x i32> %mask, zeroinitializer + %a = and <8 x i32> %x, + %b = or <8 x i32> %a, %y + %c = select <8 x i1> %m, <8 x i32> %b, <8 x i32> %x + ret <8 x i32> %c +} + +define <2 x i64> @ternlog_masky_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm1 {%k1} +; KNL-NEXT: vmovdqa %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %xmm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: retq + %m = icmp slt <2 x i64> %mask, zeroinitializer + %a = and <2 x i64> %x, + %b = xor <2 x i64> %a, %y + %c = select <2 x i1> %m, <2 x i64> %b, <2 x i64> %y + ret <2 x i64> %c +} + +define <4 x i64> @ternlog_masky_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) { +; KNL-LABEL: ternlog_masky_xor_and_mask_ymm: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm1 {%k1} +; KNL-NEXT: vmovdqa %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: ternlog_masky_xor_and_mask_ymm: +; SKX: ## %bb.0: +; SKX-NEXT: vpmovq2m %ymm2, %k1 +; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovapd %ymm1, %ymm0 +; SKX-NEXT: retq + %m = icmp slt <4 x i64> %mask, zeroinitializer + %a = and <4 x i64> %x, + %b = xor <4 x i64> %a, %y + %c = select <4 x i1> %m, <4 x i64> %b, <4 x i64> %y + ret <4 x i64> %c +} diff --git a/llvm/test/CodeGen/X86/basicblock-sections-clusters-branches.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-branches.ll similarity index 93% rename from llvm/test/CodeGen/X86/basicblock-sections-clusters-branches.ll rename to llvm/test/CodeGen/X86/basic-block-sections-clusters-branches.ll index a6f297392d94d..ec90c279e6ffa 100644 --- a/llvm/test/CodeGen/X86/basicblock-sections-clusters-branches.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-branches.ll @@ -7,7 +7,7 @@ ; RUN: echo '!foo' > %t1 ; RUN: echo '!!0 2' >> %t1 ; RUN: echo '!!1' >> %t1 -; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1 +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1 ; ; Test2: Basic blocks #1 and 
#3 will be placed in the same section.
; The rest (#0 and #2) go into the function's section.
@@ -15,7 +15,7 @@
; #2 must have an explicit jump to #3.
; RUN: echo '!foo' > %t2
; RUN: echo '!!1 3' >> %t2
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
define void @foo(i1 zeroext) nounwind {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-clusters-eh.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-eh.ll
similarity index 95%
rename from llvm/test/CodeGen/X86/basicblock-sections-clusters-eh.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-clusters-eh.ll
index 60f62d4c152ff..4e80c72f11036 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-clusters-eh.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-eh.ll
@@ -5,14 +5,14 @@
; The rest will be placed in a section along with the entry basic block.
; RUN: echo '!main' > %t1
; RUN: echo '!!1 2' >> %t1
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1
;
; Test2: Basic blocks #1, #2, and #3 go into a separate section.
; No separate exception section will be created as #1 and #3 are already in one section.
; The rest will be placed in a section along with the entry basic block.
; RUN: echo '!main' > %t2
; RUN: echo '!!1 2 3' >> %t2
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
@_ZTIi = external constant i8*
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
similarity index 75%
rename from llvm/test/CodeGen/X86/basicblock-sections-clusters-error.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
index 29946d0f5b4bf..28c79e28fc038 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-clusters-error.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
@@ -2,19 +2,19 @@
; RUN: echo '!f' > %t1
; RUN: echo '!!1 4' >> %t1
; RUN: echo '!!1' >> %t1
-; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t1 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR1
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR1
; CHECK-ERROR1: LLVM ERROR: Invalid profile {{.*}} at line 3: Duplicate basic block id found '1'.
; RUN: echo '!f' > %t2
; RUN: echo '!!4 0' >> %t2
-; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR2
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR2
; CHECK-ERROR2: LLVM ERROR: Invalid profile {{.*}} at line 2: Entry BB (0) does not begin a cluster.
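The four CHECK-ERROR patterns in this file pin down the grammar of the cluster profile consumed by -basic-block-sections=: a '!function' line opens a cluster list, each following '!!' line names one cluster of unsigned basic-block ids, an id may appear in at most one cluster, and the entry block (id 0) may only begin a cluster. The Python sketch below is purely an illustration of those diagnostics; the names (validate_profile, etc.) are invented here, and LLVM's real parser lives in the bbsections-prepare pass, not in this code.

# Illustrative validator for the '!func' / '!!<ids>' profile format; the
# error strings mirror the CHECK-ERROR1..CHECK-ERROR4 patterns in this file.
def validate_profile(lines):
    current_func = None
    seen_ids = set()
    for lineno, line in enumerate(lines, start=1):
        if line.startswith('!!'):
            if current_func is None:
                raise ValueError(f"at line {lineno}: Cluster list does not "
                                 f"follow a function name specifier.")
            for pos, tok in enumerate(line[2:].split()):
                if not tok.isdigit():
                    raise ValueError(f"at line {lineno}: Unsigned integer "
                                     f"expected: '{tok}'.")
                bb_id = int(tok)
                if bb_id in seen_ids:
                    raise ValueError(f"at line {lineno}: Duplicate basic "
                                     f"block id found '{bb_id}'.")
                if bb_id == 0 and pos != 0:
                    raise ValueError(f"at line {lineno}: Entry BB (0) does "
                                     f"not begin a cluster.")
                seen_ids.add(bb_id)
        elif line.startswith('!'):
            current_func, seen_ids = line[1:], set()

try:
    validate_profile(['!f', '!!1 4', '!!1'])  # the %t1 case above
except ValueError as err:
    print(f"LLVM ERROR: Invalid profile {err}")  # ...at line 3: Duplicate basic block id found '1'.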
; RUN: echo '!f' > %t3
; RUN: echo '!!-1' >> %t3
-; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR3
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR3
; CHECK-ERROR3: LLVM ERROR: Invalid profile {{.*}} at line 2: Unsigned integer expected: '-1'.
; RUN: echo '!!1' > %t4
; RUN: echo '!f' >> %t4
-; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR4
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR4
; CHECK-ERROR4: LLVM ERROR: Invalid profile {{.*}} at line 1: Cluster list does not follow a function name specifier.
define i32 @dummy(i32 %x, i32 %y, i32 %z) {
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-clusters.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters.ll
similarity index 94%
rename from llvm/test/CodeGen/X86/basicblock-sections-clusters.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-clusters.ll
index 2c40542f6808f..12759496fddd3 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-clusters.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters.ll
@@ -6,13 +6,13 @@
; RUN: echo '!foo' > %t1
; RUN: echo '!!0 2' >> %t1
; RUN: echo '!!1' >> %t1
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 | FileCheck %s -check-prefix=LINUX-SECTIONS1
;
; Test2: Basic blocks #1 and #3 will be placed in the same section.
; All other BBs (including the entry block) go into the function's section.
; RUN: echo '!foo' > %t2
; RUN: echo '!!1 3' >> %t2
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 | FileCheck %s -check-prefix=LINUX-SECTIONS2
define void @foo(i1 zeroext) nounwind {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-cold.ll b/llvm/test/CodeGen/X86/basic-block-sections-cold.ll
similarity index 93%
rename from llvm/test/CodeGen/X86/basicblock-sections-cold.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-cold.ll
index c7282a1e57368..432c0952a4a08 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-cold.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-cold.ll
@@ -2,7 +2,7 @@
; Basic block with id 1 and 2 must be in the cold section.
; RUN: echo '!_Z3bazb' > %t
; RUN: echo '!!0' >> %t
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
define void @_Z3bazb(i1 zeroext) nounwind {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-directjumps.ll b/llvm/test/CodeGen/X86/basic-block-sections-directjumps.ll
similarity index 84%
rename from llvm/test/CodeGen/X86/basicblock-sections-directjumps.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-directjumps.ll
index 8604b129b54cf..99a64ef130319 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-directjumps.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-directjumps.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=all -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basicblock-sections=all -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=all -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basic-block-sections=all -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
define void @_Z3bazb(i1 zeroext) {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-eh.ll b/llvm/test/CodeGen/X86/basic-block-sections-eh.ll
similarity index 96%
rename from llvm/test/CodeGen/X86/basicblock-sections-eh.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-eh.ll
index 7e5f4a2fe3927..3cc5979d7c0c0 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-eh.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-eh.ll
@@ -1,5 +1,5 @@
; Check if landing pads are kept in a separate eh section
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basicblock-sections=all -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basic-block-sections=all -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
@_ZTIb = external constant i8*
define i32 @_Z3foob(i1 zeroext %0) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-labels.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll
similarity index 92%
rename from llvm/test/CodeGen/X86/basicblock-sections-labels.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-labels.ll
index 2f077e6866924..80aaf79c115a4 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-labels.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll
@@ -1,5 +1,5 @@
; Check the basic block sections labels option
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=labels | FileCheck %s -check-prefix=LINUX-LABELS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=labels | FileCheck %s -check-prefix=LINUX-LABELS
define void @_Z3bazb(i1 zeroext) {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
similarity index 95%
rename from llvm/test/CodeGen/X86/basicblock-sections-list.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-list.ll
index 9a5056af39ed0..7c48628f73a8c 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-list.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
@@ -1,6 +1,6 @@
; Check the basic block sections list option.
; RUN: echo '!_Z3foob' > %t
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
define i32 @_Z3foob(i1 zeroext %0) nounwind {
  %2 = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-listbb.ll b/llvm/test/CodeGen/X86/basic-block-sections-listbb.ll
similarity index 93%
rename from llvm/test/CodeGen/X86/basicblock-sections-listbb.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-listbb.ll
index ac17a461d7af9..ab729ecf9eb24 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-listbb.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-listbb.ll
@@ -2,7 +2,7 @@
; Only basic block with id 2 must get a section.
; RUN: echo '!_Z3bazb' > %t
; RUN: echo '!!2' >> %t
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS
define void @_Z3bazb(i1 zeroext) nounwind {
  %2 = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-mir-parse.mir b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
similarity index 98%
rename from llvm/test/CodeGen/X86/basicblock-sections-mir-parse.mir
rename to llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
index 6011342a6f003..a4219fa1509d5 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-mir-parse.mir
+++ b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
@@ -10,7 +10,7 @@
# }
#
# clang -O0 -S -emit-llvm foo.cc
-# llc < foo.ll -stop-after=bbsections-prepare -basicblock-sections=all
+# llc < foo.ll -stop-after=bbsections-prepare -basic-block-sections=all
--- |
  ; Function Attrs: noinline nounwind optnone uwtable
diff --git a/llvm/test/CodeGen/X86/basicblock-sections-mir-print.ll b/llvm/test/CodeGen/X86/basic-block-sections-mir-print.ll
similarity index 91%
rename from llvm/test/CodeGen/X86/basicblock-sections-mir-print.ll
rename to llvm/test/CodeGen/X86/basic-block-sections-mir-print.ll
index bd66f44ecc6db..7b3c7246971da 100644
--- a/llvm/test/CodeGen/X86/basicblock-sections-mir-print.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-mir-print.ll
@@ -2,7 +2,7 @@
; RUN: echo '!_Z3foob' > %t
; RUN: echo '!!1' >> %t
; RUN: echo '!!2' >> %t
-; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=%t -stop-after=bbsections-prepare | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t -stop-after=bbsections-prepare | FileCheck %s -check-prefix=CHECK
@_ZTIb = external constant i8*
define dso_local i32 @_Z3foob(i1 zeroext %0) {
diff --git a/llvm/test/CodeGen/X86/basicblock-sections.ll b/llvm/test/CodeGen/X86/basic-block-sections.ll
similarity index 84%
rename from llvm/test/CodeGen/X86/basicblock-sections.ll
rename to llvm/test/CodeGen/X86/basic-block-sections.ll
index 5c17b755fa70b..d996f5e9f5397 100644 --- a/llvm/test/CodeGen/X86/basicblock-sections.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basicblock-sections=all -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basicblock-sections=all -unique-bb-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=all -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -function-sections -basic-block-sections=all -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS define void @_Z3bazb(i1 zeroext) nounwind { %2 = alloca i8, align 1 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index f3805163b691a..75e3ab7639a95 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -94,7 +94,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 @@ -104,7 +104,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -293,7 +293,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 @@ -307,7 +307,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 @@ -545,7 +545,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 @@ -554,7 +554,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768] ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw 
%xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 @@ -566,7 +566,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] @@ -579,7 +579,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 20135e8ca29b3..e7fcb50682464 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -119,7 +119,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3-LABEL: ext_i8_8i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 @@ -130,7 +130,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-LABEL: ext_i8_8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -373,7 +373,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3-LABEL: ext_i16_16i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 @@ -389,7 +389,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1-LABEL: ext_i16_16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 @@ -700,7 +700,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-LABEL: ext_i32_32i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; 
SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 @@ -711,7 +711,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: psrlw $15, %xmm1 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 @@ -725,7 +725,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-LABEL: ext_i32_32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] @@ -740,7 +740,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index ac2fd448452d4..65bf43ff2ed9e 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -93,7 +93,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; SSE2-SSSE3-LABEL: bitcast_i8_8i1: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 @@ -104,7 +104,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX1-LABEL: bitcast_i8_8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector.ll index e319255e8f0c3..995532cb5b9d7 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector.ll @@ -19,7 +19,7 @@ define i1 @foo(i64 %a) { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: movaps %xmm0, %xmm1 -; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 ; X86-SSE-NEXT: setp %al ; X86-SSE-NEXT: retl @@ -27,7 +27,7 @@ define i1 @foo(i64 %a) { ; X64-LABEL: foo: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-NEXT: ucomiss %xmm1, %xmm0 ; X64-NEXT: setp %al ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll 
b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index faccb8158d322..7c61f1cac2d8d 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -515,7 +515,7 @@ define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind { ; SSE2-SSSE3-NEXT: shlq $32, %rdx ; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax ; SSE2-SSSE3-NEXT: addl %ecx, %eax ; SSE2-SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll index 863ab4dee1238..19f9155d851a6 100644 --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -69,21 +69,21 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: extract1_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -114,21 +114,21 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: extract2_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -242,7 +242,7 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) { define <2 x i64> @extract1_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract1_i32_zext_insert1_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] @@ -332,7 +332,7 @@ define <2 x i64> @extract3_i32_zext_insert1_i64_undef(<4 x i32> %x) { define <2 x i64> @extract3_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract3_i32_zext_insert1_i64_zero: ; SSE2: # 
%bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index e428ae8d59194..8758779d97c52 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -792,9 +792,9 @@ define i32 @PR46586(i8* %p, <4 x i32> %v) { ; SSE2-NEXT: movzbl 3(%rdi), %eax ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pinsrw $6, %eax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: xorl %edx, %edx ; SSE2-NEXT: divl %ecx diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll index 2e044548404eb..c2e55cf458b59 100644 --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -31,7 +31,7 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; SSE41-NEXT: packssdw %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: pmovsxwd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 ; SSE41-NEXT: retq ; @@ -39,9 +39,9 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; AVX1: # %bb.0: ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 ; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 @@ -87,7 +87,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> % ; SSE41-NEXT: packssdw %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: retq ; @@ -411,10 +411,10 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE2-LABEL: example24: ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE2-NEXT: .p2align 4, 0x90 @@ -443,10 +443,10 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE41-LABEL: example24: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; 
SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE41-NEXT: .p2align 4, 0x90 @@ -459,7 +459,7 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE41-NEXT: packssdw %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax) @@ -472,17 +472,17 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind { ; AVX1-LABEL: example24: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body diff --git a/llvm/test/CodeGen/X86/cfi-basic-block-sections-1.ll b/llvm/test/CodeGen/X86/cfi-basic-block-sections-1.ll index 62e669eff9e4e..a5a6d697451fc 100644 --- a/llvm/test/CodeGen/X86/cfi-basic-block-sections-1.ll +++ b/llvm/test/CodeGen/X86/cfi-basic-block-sections-1.ll @@ -1,6 +1,6 @@ -; RUN: llc -O0 %s --basicblock-sections=all -mtriple=x86_64 -filetype=asm --frame-pointer=all -o - | FileCheck --check-prefix=SECTIONS_CFI %s -; RUN: llc -O0 %s --basicblock-sections=all -mtriple=x86_64 -filetype=asm --frame-pointer=none -o - | FileCheck --check-prefix=SECTIONS_NOFP_CFI %s -; RUN: llc -O0 %s --basicblock-sections=all -mtriple=x86_64 -filetype=obj --frame-pointer=all -o - | llvm-dwarfdump --eh-frame - | FileCheck --check-prefix=EH_FRAME %s +; RUN: llc -O0 %s --basic-block-sections=all -mtriple=x86_64 -filetype=asm --frame-pointer=all -o - | FileCheck --check-prefix=SECTIONS_CFI %s +; RUN: llc -O0 %s --basic-block-sections=all -mtriple=x86_64 -filetype=asm --frame-pointer=none -o - | FileCheck --check-prefix=SECTIONS_NOFP_CFI %s +; RUN: llc -O0 %s --basic-block-sections=all -mtriple=x86_64 -filetype=obj --frame-pointer=all -o - | llvm-dwarfdump --eh-frame - | FileCheck --check-prefix=EH_FRAME %s ;; void f1(); ;; void f3(bool b) { diff --git a/llvm/test/CodeGen/X86/cfi-inserter-basic-block-sections-callee-save-registers.ll b/llvm/test/CodeGen/X86/cfi-inserter-basic-block-sections-callee-save-registers.ll index 19725138f6ed9..d87ead0e864cf 100644 --- a/llvm/test/CodeGen/X86/cfi-inserter-basic-block-sections-callee-save-registers.ll +++ b/llvm/test/CodeGen/X86/cfi-inserter-basic-block-sections-callee-save-registers.ll @@ -1,6 +1,6 @@ ;; This test checks if CFI instructions for all callee saved registers are emitted ;; correctly with basic block sections. 
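An aside before the remaining hunks: nearly all of the assembly churn in the shuffle tests above and below (bitcast-int-to-vector-*, buildvec-*, cast-vsel, clear_upper_vector_element_bits, the combine-* tests) is one canonicalization, rewriting don't-care lanes in pshufd/pshuflw immediates, e.g. [1,1,2,3] to [1,1,1,1] and [2,3,0,1] to [2,3,2,3], when later instructions read only the low element or low qword. The imm8 packs a 2-bit source index per destination lane, so the equivalence is easy to check mechanically. A small Python model of the ISA semantics (illustrative only, not LLVM code; pshuflw behaves the same way over the low four words):

# pshufd: destination dword i = source dword ((imm >> 2*i) & 3), i = 0..3.
def pshufd(src, imm):
    return [src[(imm >> (2 * i)) & 3] for i in range(4)]

def encode(mask):  # [d0, d1, d2, d3] notation used in the CHECK comments
    return sum((sel & 3) << (2 * i) for i, sel in enumerate(mask))

src = ['a', 'b', 'c', 'd']
# Only lane 0 is consumed afterwards (movd/ucomiss/...): same result either way.
assert pshufd(src, encode([1, 1, 2, 3]))[0] == pshufd(src, encode([1, 1, 1, 1]))[0]
# Only the low qword (lanes 0 and 1) is consumed: same result either way.
assert pshufd(src, encode([2, 3, 0, 1]))[:2] == pshufd(src, encode([2, 3, 2, 3]))[:2]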
-; RUN: llc %s -mtriple=x86_64 -filetype=asm --basicblock-sections=all --frame-pointer=all -o - | FileCheck --check-prefix=SECTIONS_CFI %s +; RUN: llc %s -mtriple=x86_64 -filetype=asm --basic-block-sections=all --frame-pointer=all -o - | FileCheck --check-prefix=SECTIONS_CFI %s ; SECTIONS_CFI: _Z3foob: ; SECTIONS_CFI: .cfi_offset %rbp, -16 diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll index b96f44ec3073f..16a993316d7e5 100644 --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -696,7 +696,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE2-LABEL: _clearupper16xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %r10 ; SSE2-NEXT: movq %r10, %r8 ; SSE2-NEXT: shrq $56, %r8 @@ -878,7 +878,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE2-LABEL: _clearupper32xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %r10, %r8 ; SSE2-NEXT: shrq $56, %r8 diff --git a/llvm/test/CodeGen/X86/cmpxchg8b.ll b/llvm/test/CodeGen/X86/cmpxchg8b.ll index caf40c541e280..998b0ad62fc98 100644 --- a/llvm/test/CodeGen/X86/cmpxchg8b.ll +++ b/llvm/test/CodeGen/X86/cmpxchg8b.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown- -mcpu=core2 | FileCheck %s --check-prefixes=CHECK,X64 ; RUN: llc < %s -mtriple=i686-unknown- -mcpu=i486 | FileCheck %s --check-prefixes=I486 ; RUN: llc < %s -mtriple=i686-unknown- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=i686-unknown- -mcpu=lakemont | FileCheck %s --check-prefixes=CHECK,X86 ; Basic 64-bit cmpxchg define void @t1(i64* nocapture %p) nounwind ssp { diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll index 05c546c411f5b..6ed5fb5f49b6e 100644 --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -198,7 +198,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float ; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE-NEXT: cvtss2sd %xmm2, %xmm5 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [NaN,NaN] ; SSE-NEXT: cvtss2sd %xmm3, %xmm3 ; SSE-NEXT: movaps %xmm6, %xmm7 @@ -267,7 +267,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl ; SSE-NEXT: orps %xmm5, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: andps %xmm4, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; SSE-NEXT: xorps %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll index b18c7246104d4..17d01e1d3362c 100644 --- a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll +++ b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll @@ -93,7 +93,7 @@ define i32 @movmskps_sext_v4i64(<4 x i32> %a0) { ; AVX1-LABEL: movmskps_sext_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovmskpd %ymm0, %eax @@ -116,7 +116,7 @@ define i32 @movmskps_sext_v8i32(<8 x i16> %a0) { ; AVX1-LABEL: movmskps_sext_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 0e448f3f3be06..27823cf5fe8c5 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -91,9 +91,9 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; SSE-LABEL: combine_zext_pmuludq_256: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [715827883,715827883] ; SSE-NEXT: pmuludq %xmm4, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1a52ebfc6cd90..ce411b5e8f06b 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2959,7 +2959,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; SSE41-LABEL: pr38658: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 @@ -2984,7 +2984,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; AVX1-LABEL: pr38658: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 @@ -3058,7 +3058,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; ; XOP-LABEL: pr38658: ; XOP: # %bb.0: -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; XOP-NEXT: vpmovsxbw %xmm1, %xmm1 ; XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index a6950873daf55..383d1866aa1d9 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -311,7 +311,7 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -344,7 +344,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -377,7 +377,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) { ; SSE41-NEXT: pmulhuw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index e2f3f2dc7523f..28a73cdb6a41e 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -152,7 +152,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrad %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrad %xmm4, %xmm5 @@ -272,7 +272,7 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 36fbdaf537025..2e886defafd4c 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -400,7 +400,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index e6d7aac926162..c44342d00357a 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -233,7 +233,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -249,7 +249,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 
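The combine-sra/srl/udiv/urem hunks all touch the same lowering: before AVX2's vpsrlvd there is no per-element 32-bit shift, so a vector-by-vector shift is expanded into four whole-vector shifts, each taking its count from the low 64 bits of a register, with the `pshufd xmm[2,3,2,3]` (formerly `[2,3,0,1]`) moving the upper pair of counts into that low half. A hedged sketch of the idea in intrinsics — the helper name and exact shuffle/blend choices are illustrative, not what the code generator emits:

```c++
#include <immintrin.h>
#include <cstdio>

// Per-element v >> c for 4 x i32 on SSE4.1, mirroring the pattern in the
// CHECK lines above: psrld takes one scalar count from the low 64 bits of
// its second operand, so each lane's count is isolated there in turn and
// the four results are recombined with blends.
static __m128i srlv_epi32_sse41(__m128i v, __m128i c) {
  __m128i zero = _mm_setzero_si128();
  __m128i c01 = _mm_unpacklo_epi32(c, zero);  // [c0, 0, c1, 0]
  __m128i c23 = _mm_unpackhi_epi32(c, zero);  // [c2, 0, c3, 0]
  // The diff's [2,3,2,3] shuffle: pull the upper qword into the low half.
  __m128i c1 = _mm_shuffle_epi32(c01, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i c3 = _mm_shuffle_epi32(c23, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i r0 = _mm_srl_epi32(v, c01);  // every lane shifted by c0
  __m128i r1 = _mm_srl_epi32(v, c1);   // ... by c1
  __m128i r2 = _mm_srl_epi32(v, c23);  // ... by c2
  __m128i r3 = _mm_srl_epi32(v, c3);   // ... by c3
  __m128i lo = _mm_blend_epi16(r0, r1, 0x0C);  // lane 0 of r0, lane 1 of r1
  __m128i hi = _mm_blend_epi16(r2, r3, 0xC0);  // lane 2 of r2, lane 3 of r3
  return _mm_blend_epi16(lo, hi, 0xF0);        // lanes 0-1 | lanes 2-3
}

int main() {
  __m128i v = _mm_setr_epi32(128, 128, 128, 128);
  __m128i c = _mm_setr_epi32(1, 2, 3, 4);
  int out[4];
  _mm_storeu_si128((__m128i *)out, srlv_epi32_sse41(v, c));
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 64 32 16 8
  return 0;
}
```

Whether a count arrives via the low qword or via the duplicated high qword, psrld ignores everything above bit 63 of its count operand, which is why swapping `[2,3,0,1]` for `[2,3,2,3]` leaves the behavior checked by these tests unchanged.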
@@ -307,7 +307,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -324,7 +324,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 @@ -384,7 +384,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -401,7 +401,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll index b21ed8ec60cef..cd0b21d02969c 100644 --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -213,7 +213,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: psrld %xmm5, %xmm6 diff --git a/llvm/test/CodeGen/X86/condbr_if.ll b/llvm/test/CodeGen/X86/condbr_if.ll deleted file mode 100644 index 7b92f712be6dd..0000000000000 --- a/llvm/test/CodeGen/X86/condbr_if.ll +++ /dev/null @@ -1,178 +0,0 @@ -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=sandybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=ivybridge %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=haswell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=broadwell %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skylake %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu -mcpu=skx %s -o - -verify-machineinstrs | FileCheck %s 
--check-prefix=MERGE -; RUN: llc -x86-condbr-folding=true -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s --check-prefix=NOTMERGE - -define i32 @length2_1(i32) { - %2 = icmp slt i32 %0, 3 - br i1 %2, label %3, label %5 - -;